Wednesday, 19 June 2019

Splitting large xml file into multiple files by using beautifulsoup

I am trying to split large xml file into smaller ones, first I started off beautifulsoup:

from bs4 import BeautifulSoup
import os
# Core settings
rootdir = r'C:\Users\XX\Documents\Grant Data\2010_xml'
extension = ".xml"
to_save = r'C:\Users\XX\Documents\all_patents_as_xml'

index = 0
for root, dirs, files in os.walk(rootdir):
    for file in files:
        if file.endswith(extension):
            print(file)
            file_name = os.path.join(root,file)
            with open(file_name) as f:
                data = f.read()
            texts = data.split('?xml version="1.0" encoding="UTF-8"?')
            for text in texts:
                index += 1
                filename = to_save + "\\"+ str(index) + ".txt"
                with open(filename, 'w') as f:
                    f.write(text)

However, I got a memory error. Then I switched to xml etree:

from xml.etree import ElementTree as ET
import re


file_name = r'C:\Users\XX\Documents\Grant Data\2010_xml\2010cat_xml.xml'


with open(file_name) as f:
    xml = f.read()
tree = ET.fromstring(re.sub(r"(<\?xml[^>]+\?>)", r"\1<root>", xml) + "</root>")
parser = ET.iterparse(tree)
to_save = r'C:\Users\Yilmaz\Documents\all_patents_as_xml'
index = 0
for event, element in parser:
    # element is a whole element
    if element.tag == '?xml version="1.0" encoding="UTF-8"?':
        index += 1
        filename = to_save + "\\"+ str(index) + ".txt"
        with open(filename, 'w') as f:
            f.write(ET.tostring(element))
        # do something with this element
        # then clean up
        element.clear()

and I get the following error:

OverflowError: size does not fit in an int

I am using windows operating system, I know in Linux you can split the xmls from consule but in my case I don't know what to do.



from Splitting large xml file into multiple files by using beautifulsoup

No comments:

Post a Comment