Extracting Info From Large Structured Text Files
Solution 1:
That is pretty good. Below are some suggestions; let me know if you like them:
import re
import pprint
import sys
class Despacho(object):
    """Accumulate the fields of one record, one input line at a time.

    Every line passed to ``read`` is tried against each pattern in
    ``regexp``; the captured groups are stored on the instance under the
    attribute names listed in the matching key.
    """

    # Maps a tuple of attribute names to the pattern whose capture groups
    # fill those attributes, in order (a dict keyed by field names is used
    # instead of one function per field).
    # NOTE(review): the dots in these patterns are unescaped, so they match
    # any character; they are left untouched so the set of accepted lines
    # does not change.
    regexp = {
        ('processo',
         'data',
         'despacho'): re.compile(r'No.([\d]{9}) ([\d]{2}/[\d]{2}/[\d]{4}) (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """Create an empty record.

        'complemento' is the only field that can occur several times in a
        single registry, so it is collected in a list.
        """
        self.complemento = []

    def read(self, line):
        """Store on the instance whatever fields *line* contains.

        Lines that match none of the patterns are ignored.
        """
        # items() instead of iteritems(): works on both Python 2 and 3.
        for attrs, pattern in self.regexp.items():
            m = pattern.match(line)
            if not m:
                continue
            # Capture groups are numbered from 1, in the same order as the
            # attribute names of the key tuple.
            for groupn, attr in enumerate(attrs, 1):
                value = m.group(groupn)
                if attr == 'complemento':
                    # special case: repeated field, append instead of replace
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)

    def __repr__(self):
        """Return a pretty-printed dict of all known fields.

        Fields that have not been read yet are shown as None.
        """
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)
def process(rpi):
    """Read lines from *rpi* and yield one Despacho per record.

    A record starts at a line beginning with 'No.' and ends at the next
    blank line, at the next 'No.' line, or at the end of the input.
    Lines outside any record are skipped.

    rpi -- any iterable of text lines (an open file works directly, no
           intermediate generator is needed).
    """
    d = None  # record currently being filled; None while outside a record
    for line in rpi:
        if line.startswith('No.'):
            # A new record began without a separating blank line: hand out
            # the open one instead of silently dropping it.
            if d is not None:
                yield d
            d = Despacho()
        elif d is None:
            # Header or stray line before/between records: there is no
            # record to feed it to.
            continue
        elif not line.strip():
            # empty line - end of block
            yield d
            d = None
            continue
        d.read(line)
    # The input may end without a trailing blank line.
    if d is not None:
        yield d
def main():
    """Parse 'rm1972.txt' and print every record followed by a dashed line.

    Returns 0.
    """
    # The with-block closes the file even if parsing or printing raises.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            print(desp)  # can print directly here.
            print('-' * 20)
    return 0


if __name__ == '__main__':
    main()
Solution 2:
It would be easier to help if you had a specific concern. Performance will depend greatly on the efficiency of the particular regex engine you are using. 100K lines in a single file doesn't sound that big, but again it all depends on your environment.
I use Expresso in my .NET development to test expressions for accuracy and performance. A Google search turned up Kodos, a GUI Python regex authoring tool.
Solution 3:
It looks good overall, but why do you have the line:
rpi = (line for line in rpi)
You can already iterate over the file object without this intermediate step.
Solution 4:
I wouldn't use regex here. If you know that your lines will start with fixed strings, why not check for those strings and write the logic around them?
for line in open(file):
    if line[0:3]=='No.':
        currIndex='No'
        map['No']=line[4:]
    ....
    ...
    elseif line.strip()=='':
        //store the record in the map and clear the map
    else:
        //append line to the last index in map.. this is when the record overflows to the next line.
        Map[currIndex]=Map[currIndex]+"\n"+line
Consider the above code as just the pseudocode.
Solution 5:
Another version with only one combined regular expression:
#!/usr/bin/python
import pprint
import re
import sys
class Despacho(object):
    """Accumulate the fields of one record, one input line at a time.

    A single combined regular expression with one named group per field is
    applied to every line passed to ``parse``.
    """

    # One alternative per kind of line; the named groups become the
    # attribute names of the instance.
    regexp = re.compile(
        r'No.(?P<processo>[\d]{9}) (?P<data>[\d]{2}/[\d]{2}/[\d]{4}) (?P<despacho>.*)'
        r'|Tit.(?P<titular>.*)'
        r'|Procurador: (?P<procurador>.*)'
        r'|C.N.P.J./C.I.C./N INPI :(?P<documento>.*)'
        r'|Apres.: (?P<apresentacao>.*) ; Nat.: (?P<natureza>.*)'
        r'|Marca: (?P<marca>.*)'
        r'|Clas.Prod/Serv: (?P<classe>.*)'
        r'|\*(?P<complemento>.*)')

    # Fields that hold a single value per record.
    simplefields = ('processo', 'data', 'despacho', 'titular', 'procurador',
                    'documento', 'apresentacao', 'natureza', 'marca', 'classe')

    def __init__(self):
        """Create an empty record with every simple field set to None.

        'complemento' is the only field that can occur several times in a
        single registry, so it is collected in a list.
        """
        for field in self.simplefields:
            setattr(self, field, None)
        self.complemento = []

    def parse(self, line):
        """Store on the instance whatever fields *line* contains.

        Lines that do not match the combined pattern are ignored.
        """
        m = self.regexp.match(line)
        if not m:
            return
        # Keep only the groups that captured non-empty text; groups of the
        # alternatives that did not match are None and are dropped here.
        gd = {k: v for k, v in m.groupdict().items() if v}
        if 'complemento' in gd:
            self.complemento.append(gd['complemento'])
        else:
            self.__dict__.update(gd)

    def __repr__(self):
        """Return the pretty-printed attribute dict of the record."""
        return pprint.pformat(self.__dict__)
def process(rpi):
    """Read lines from *rpi* and yield one Despacho per record.

    A record starts at a line beginning with 'No.' and runs until the next
    such line or the end of the input. Lines before the first record are
    skipped, and nothing is yielded when the input holds no record.

    rpi -- any iterable of text lines, e.g. an open file.
    """
    d = None  # record currently being filled; None until the first 'No.'
    for line in rpi:
        if line.startswith('No.'):
            if d is not None:
                yield d
            d = Despacho()
        # Before the first 'No.' line there is no record to parse into.
        if d is not None:
            d.parse(line)
    # Hand out the last record; skipped when no record was ever started so
    # that callers never receive None.
    if d is not None:
        yield d
def main():
    """Parse 'rm1972.txt' and print every record followed by a dashed line."""
    # open() instead of the Python 2-only file() builtin; the with-block
    # closes the file even if parsing or printing raises.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            print(desp)  # can print directly here.
            print('-' * 20)


if __name__ == '__main__':
    main()
Post a Comment for "Extracting Info From Large Structured Text Files"