Python RegEx Project
   6 min read

Project description

Use regular expressions to parse a bibliographic MARC record stored in text format (bib.txt) and convert it to MARCXML (bib.xml) using the Python lxml library.

Python code

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re # import re (regular expression) module
from lxml import etree

# Convert a MARC record in text format (bib.txt) to XML (bib.xml).
#
# Each non-blank input line is one MARC field: a three-character tag in
# columns 0-2, followed (from column 4) by the field content. Variable fields
# carry two indicator characters in columns 4-5 and subfields introduced by
# the 'ǂ' delimiter from column 6 onwards.

# read text file:
with open('bib.txt', encoding='utf-8') as myFile:
    data = myFile.read()

# split text into non-empty lines (marc fields):
marcFields = [line for line in re.split("\n", data) if line.strip()]  # remove blank/whitespace lines

# create xml root node:
xmlRootNode = etree.Element('record', xmlns='http://www.loc.gov/MARC21/slim')

# add xml root node content (one child element per marc field):
for marcField in marcFields:
    marcTag = marcField[0:3]  # the tag is the first three characters of the line.

    # extract leader field:
    if marcTag == 'LDR':
        ldrText = marcField[4:]
        etree.SubElement(xmlRootNode, 'leader').text = ldrText  # add leader xml subelement.
        continue

    # skip directory field (it is not written to the xml output):
    if marcTag == 'DIR':
        continue

    # extract control fields (tags 00X); raw string so that \d reaches the
    # regex engine instead of being treated as an invalid string escape:
    if re.search(r'00\d', marcTag):
        ctrlFieldText = marcField[4:]
        etree.SubElement(xmlRootNode, 'controlfield', tag=marcTag).text = ctrlFieldText  # add control field xml subelement.
        continue

    # extract variable fields:
    indicator1 = marcField[4:5]  # extract indicator 1.
    indicator2 = marcField[5:6]  # extract indicator 2.
    dataField = etree.SubElement(xmlRootNode, 'datafield', tag=marcTag, ind1=indicator1, ind2=indicator2)  # add datafield xml subelement.

    # extract subfields: split on the delimiter and drop blank/whitespace chunks
    # (e.g. the padding that precedes the first delimiter).
    rawSubfields = [chunk for chunk in re.split('ǂ', marcField[6:]) if chunk.strip()]
    for rawSubfield in rawSubfields:
        marcSubfieldTag = rawSubfield[0]            # subfield code is the character right after the delimiter.
        marcSubfieldData = rawSubfield[2:].strip()  # skip the code and the character after it, trim padding.
        etree.SubElement(dataField, 'subfield', code=marcSubfieldTag).text = marcSubfieldData  # add subfield xml subelement.

# generate xml file:
fileContent = etree.tostring(xmlRootNode, encoding='unicode', pretty_print=True)
with open('bib.xml', 'w', encoding='utf-8') as myFile:  # open file for writing.
    myFile.write(fileContent)  # specify string to write.

# print xml file (re-read from disk; the with-block closes the handle afterwards):
with open('bib.xml', encoding='utf-8') as myFile:
    print(myFile.read())

Text file

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
LDR 01579nam a2200397 i 4500
DIR 001001900000003000800019005001700027008004100044010001800085020003100103020002800134035003200162040003100194043001200225100003900237245004300276250001900319264006000338264001100398300002400409336002600433337002800459338002700487490002800514500007700542520009900619588007400718650003600792650005000828600002700878650003100905650004600936655004200982655003101024907012601055
001 BIB-0000-0000-0152
003 ObibOrg
005 20201001000000.0
008 201001t20182018nyu 000 f eng d
010    ǂa 2018945140
020    ǂa 9780316484800 ǂq (hardcover)
020    ǂa 0316484806 ǂq (hardcover)
035    ǂa (ObibOrg)BIB-0000-0000-0152
040    ǂa ObibOrg ǂb eng ǂe rda ǂc ObibOrg
043    ǂa n-us-ca
100 1  ǂa Connelly, Michael, ǂd 1956- ǂe author.
245 10 ǂa Dark sacred night / ǂc Michael Connelly.
250    ǂa First edition.
264  1 ǂa New York, New York : ǂb Little, Brown and Company, ǂc 2018.
264  4 ǂc ©2018
300    ǂa 433 pages ; ǂc 25 cm.
336    ǂa text ǂb txt ǂ2 rdacontent
337    ǂa unmediated ǂb n ǂ2 rdamedia
338    ǂa volume ǂb nc ǂ2 rdacarrier
490 0  ǂa Ballard and Bosch novel
500    ǂa This is a Ballard and Bosch Novel, a spin-off of the Harry Bosch series.
520 0  ǂa Harry Bosch and Renée Ballard team up to investigate the homicide of a runaway teenage girl.
588    ǂa This bibliographic record is licensed under Creative Commons CC0 1.0.
650  0 ǂa Murder ǂx Investigation ǂv Fiction.
650  0 ǂa Cold cases (Criminal investigation) ǂv Fiction.
600 10 ǂa Bosch, Harry ǂv Fiction.
650  0 ǂa Women detectives ǂv Fiction.
650  0 ǂa Police ǂz California ǂz Los Angeles ǂv Fiction.
655  7 ǂa Detective and mystery fiction. ǂ2 lcgft
655  7 ǂa Thrillers (Fiction) ǂ2 lcgft
907    ǂa Novel ǂb Mystery ǂb Suspense / Thriller ǂc Cold case ǂc Detective ǂc Imaginary person ǂc Investigation ǂc Murder ǂc North America ǂc Police

Generated XML file

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
<record xmlns="http://www.loc.gov/MARC21/slim">
  <leader>01579nam a2200397 i 4500</leader>
  <controlfield tag="001">BIB-0000-0000-0152</controlfield>
  <controlfield tag="003">ObibOrg</controlfield>
  <controlfield tag="005">20201001000000.0</controlfield>
  <controlfield tag="008">201001t20182018nyu 000 f eng d</controlfield>
  <datafield tag="010" ind1=" " ind2=" ">
    <subfield code="a">2018945140</subfield>
  </datafield>
  <datafield tag="020" ind1=" " ind2=" ">
    <subfield code="a">9780316484800</subfield>
    <subfield code="q">(hardcover)</subfield>
  </datafield>
  <datafield tag="020" ind1=" " ind2=" ">
    <subfield code="a">0316484806</subfield>
    <subfield code="q">(hardcover)</subfield>
  </datafield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="a">(ObibOrg)BIB-0000-0000-0152</subfield>
  </datafield>
  <datafield tag="040" ind1=" " ind2=" ">
    <subfield code="a">ObibOrg</subfield>
    <subfield code="b">eng</subfield>
    <subfield code="e">rda</subfield>
    <subfield code="c">ObibOrg</subfield>
  </datafield>
  <datafield tag="043" ind1=" " ind2=" ">
    <subfield code="a">n-us-ca</subfield>
  </datafield>
  <datafield tag="100" ind1="1" ind2=" ">
    <subfield code="a">Connelly, Michael,</subfield>
    <subfield code="d">1956-</subfield>
    <subfield code="e">author.</subfield>
  </datafield>
  <datafield tag="245" ind1="1" ind2="0">
    <subfield code="a">Dark sacred night /</subfield>
    <subfield code="c">Michael Connelly.</subfield>
  </datafield>
  <datafield tag="250" ind1=" " ind2=" ">
    <subfield code="a">First edition.</subfield>
  </datafield>
  <datafield tag="264" ind1=" " ind2="1">
    <subfield code="a">New York, New York :</subfield>
    <subfield code="b">Little, Brown and Company,</subfield>
    <subfield code="c">2018.</subfield>
  </datafield>
  <datafield tag="264" ind1=" " ind2="4">
    <subfield code="c">©2018</subfield>
  </datafield>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">433 pages ;</subfield>
    <subfield code="c">25 cm.</subfield>
  </datafield>
  <datafield tag="336" ind1=" " ind2=" ">
    <subfield code="a">text</subfield>
    <subfield code="b">txt</subfield>
    <subfield code="2">rdacontent</subfield>
  </datafield>
  <datafield tag="337" ind1=" " ind2=" ">
    <subfield code="a">unmediated</subfield>
    <subfield code="b">n</subfield>
    <subfield code="2">rdamedia</subfield>
  </datafield>
  <datafield tag="338" ind1=" " ind2=" ">
    <subfield code="a">volume</subfield>
    <subfield code="b">nc</subfield>
    <subfield code="2">rdacarrier</subfield>
  </datafield>
  <datafield tag="490" ind1="0" ind2=" ">
    <subfield code="a">Ballard and Bosch novel</subfield>
  </datafield>
  <datafield tag="500" ind1=" " ind2=" ">
    <subfield code="a">This is a Ballard and Bosch Novel, a spin-off of the Harry Bosch series.</subfield>
  </datafield>
  <datafield tag="520" ind1="0" ind2=" ">
    <subfield code="a">Harry Bosch and Renée Ballard team up to investigate the homicide of a runaway teenage girl.</subfield>
  </datafield>
  <datafield tag="588" ind1=" " ind2=" ">
    <subfield code="a">This bibliographic record is licensed under Creative Commons CC0 1.0.</subfield>
  </datafield>
  <datafield tag="650" ind1=" " ind2="0">
    <subfield code="a">Murder</subfield>
    <subfield code="x">Investigation</subfield>
    <subfield code="v">Fiction.</subfield>
  </datafield>
  <datafield tag="650" ind1=" " ind2="0">
    <subfield code="a">Cold cases (Criminal investigation)</subfield>
    <subfield code="v">Fiction.</subfield>
  </datafield>
  <datafield tag="600" ind1="1" ind2="0">
    <subfield code="a">Bosch, Harry</subfield>
    <subfield code="v">Fiction.</subfield>
  </datafield>
  <datafield tag="650" ind1=" " ind2="0">
    <subfield code="a">Women detectives</subfield>
    <subfield code="v">Fiction.</subfield>
  </datafield>
  <datafield tag="650" ind1=" " ind2="0">
    <subfield code="a">Police</subfield>
    <subfield code="z">California</subfield>
    <subfield code="z">Los Angeles</subfield>
    <subfield code="v">Fiction.</subfield>
  </datafield>
  <datafield tag="655" ind1=" " ind2="7">
    <subfield code="a">Detective and mystery fiction.</subfield>
    <subfield code="2">lcgft</subfield>
  </datafield>
  <datafield tag="655" ind1=" " ind2="7">
    <subfield code="a">Thrillers (Fiction)</subfield>
    <subfield code="2">lcgft</subfield>
  </datafield>
  <datafield tag="907" ind1=" " ind2=" ">
    <subfield code="a">Novel</subfield>
    <subfield code="b">Mystery</subfield>
    <subfield code="b">Suspense / Thriller</subfield>
    <subfield code="c">Cold case</subfield>
    <subfield code="c">Detective</subfield>
    <subfield code="c">Imaginary person</subfield>
    <subfield code="c">Investigation</subfield>
    <subfield code="c">Murder</subfield>
    <subfield code="c">North America</subfield>
    <subfield code="c">Police</subfield>
  </datafield>
</record>