I am trying to split large xml file into smaller ones, first I started off beautifulsoup:
from bs4 import BeautifulSoup
import os
# Core settings
rootdir = r'C:\Users\XX\Documents\Grant Data\2010_xml'
extension = ".xml"
to_save = r'C:\Users\XX\Documents\all_patents_as_xml'
index = 0
for root, dirs, files in os.walk(rootdir):
for file in files:
if file.endswith(extension):
print(file)
file_name = os.path.join(root,file)
with open(file_name) as f:
data = f.read()
texts = data.split('?xml version="1.0" encoding="UTF-8"?')
for text in texts:
index += 1
filename = to_save + "\\"+ str(index) + ".txt"
with open(filename, 'w') as f:
f.write(text)
However, I got a memory error. Then I switched to xml etree:
from xml.etree import ElementTree as ET
import re
file_name = r'C:\Users\XX\Documents\Grant Data\2010_xml\2010cat_xml.xml'
with open(file_name) as f:
xml = f.read()
tree = ET.fromstring(re.sub(r"(<\?xml[^>]+\?>)", r"\1<root>", xml) + "</root>")
parser = ET.iterparse(tree)
to_save = r'C:\Users\Yilmaz\Documents\all_patents_as_xml'
index = 0
for event, element in parser:
# element is a whole element
if element.tag == '?xml version="1.0" encoding="UTF-8"?':
index += 1
filename = to_save + "\\"+ str(index) + ".txt"
with open(filename, 'w') as f:
f.write(ET.tostring(element))
# do something with this element
# then clean up
element.clear()
and I get the following error:
OverflowError: size does not fit in an int
I am using windows operating system, I know in Linux you can split the xmls from consule but in my case I don't know what to do.
from Splitting large xml file into multiple files by using beautifulsoup
No comments:
Post a Comment