1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re  # regular expression module

from lxml import etree

# Convert a plain-text MARC record dump (bib.txt) into a MARCXML-style
# document (bib.xml), then print the generated XML.
#
# Each non-blank input line is treated as one field whose first three
# characters are the tag:
#   LDR      -> <leader>
#   DIR      -> skipped
#   00x      -> <controlfield tag="00x">
#   others   -> <datafield tag ind1 ind2> with one <subfield> per 'ǂ' segment

# Read the text file.
with open('bib.txt', encoding='utf-8') as myFile:
    data = myFile.read()

# Split the text into non-empty lines (MARC fields). A plain '\n' split is
# used on purpose: str.splitlines() would also break on other separator
# characters that may legitimately occur inside field data.
marcFields = [line for line in data.split('\n') if line.strip()]

# Create the XML root node.
# NOTE(review): the namespace is emitted as a plain 'xmlns' attribute, as in
# the original script; the elements are not namespace-qualified in the tree.
xmlRootNode = etree.Element('record', xmlns='http://www.loc.gov/MARC21/slim')

# Add the root node content, one child element per MARC field.
for marcField in marcFields:
    marcTag = marcField[0:3]

    # Leader: the text after the tag and the single character following it.
    if marcTag == 'LDR':
        etree.SubElement(xmlRootNode, 'leader').text = marcField[4:]
        continue

    # Directory: intentionally not carried over into the XML.
    if marcTag == 'DIR':
        continue

    # Control fields (tags 000-009): no indicators or subfields, text only.
    # Raw string so that '\d' reaches the regex engine instead of being an
    # invalid string escape.
    if re.search(r'00\d', marcTag):
        etree.SubElement(xmlRootNode, 'controlfield', tag=marcTag).text = marcField[4:]
        continue

    # Variable data fields: characters 4 and 5 are the two indicators.
    dataField = etree.SubElement(
        xmlRootNode,
        'datafield',
        tag=marcTag,
        ind1=marcField[4:5],
        ind2=marcField[5:6],
    )

    # Subfields: the remainder of the line is split on the 'ǂ' delimiter.
    for rawSubfield in marcField[6:].split('ǂ'):
        if not rawSubfield.strip():
            continue  # ignore blank/whitespace-only segments
        # First character is the subfield code; the data starts at index 2
        # (index 1 is presumably a separator space - confirm against input).
        subfield = etree.SubElement(dataField, 'subfield', code=rawSubfield[0])
        subfield.text = rawSubfield[2:].strip()

# Generate the XML file.
fileContent = etree.tostring(xmlRootNode, encoding='unicode', pretty_print=True)
with open('bib.xml', 'w', encoding='utf-8') as myFile:
    myFile.write(fileContent)

# Print the XML file; the context manager closes the handle afterwards.
with open('bib.xml', encoding='utf-8') as myFile:
    print(myFile.read())