I need to read some large files (from 50k to 100k lines), structured in

Question

0

Asked: May 11, 2026 (2026-05-11T02:11:34+00:00)

I need to read some large files (from 50k to 100k lines), structured in

0

I need to read some large files (from 50k to 100k lines), structured in groups separated by empty lines. Each group starts with the same pattern ‘No.999999999 dd/mm/yyyy ZZZ’. Here's some sample data.

No.813829461 16/09/1987 270
Tit.SUZANO PAPEL E CELULOSE S.A. (BR/BA)
C.N.P.J./C.I.C./N INPI : 16404287000155
Procurador: MARCELLO DO NASCIMENTO

No.815326777 28/12/1989 351
Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA (BR/RJ)
C.N.P.J./C.I.C./NºINPI : 34162651000108
Apres.: Nominativa ; Nat.: De Produto
Marca: TRIO TROPICAL
Clas.Prod/Serv: 09.40
*DEFERIDO CONFORME RESOLUÇÃO 123 DE 06/01/2006, PUBLICADA NA RPI 1829, DE 24/01/2006.
Procurador: WALDEMAR RODRIGUES PEDRA

No.900148764 11/01/2007 LD3
Tit.TIARA BOLSAS E CALÇADOS LTDA
Procurador: Marcia Ferreira Gomes
*Escritório: Marcas Marcantes e Patentes Ltda
*Exigência Formal não respondida Satisfatoriamente, Pedido de Registro de Marca considerado inexistente, de acordo com Art. 157 da LPI
*Protocolo da Petição de cumprimento de Exigência Formal: 810080140197

I wrote some code that parses it accordingly. Is there anything I can improve for readability or performance? Here's what I have come up with so far:

import pprint
import re


class Despacho(object):
    """Accumulate the fields of one group of lines.

    Each line handed to :meth:`read` is matched against the patterns in
    ``regexp``; the captured text is stored on the instance as plain
    attributes (``processo``, ``data``, ``despacho``, ``titular``,
    ``procurador``, ``documento``, ``apresentacao``, ``natureza``, ``marca``,
    ``classe``).  An attribute only exists once a matching line has been
    read.  ``complemento`` is always present and collects every ``*`` line.
    """

    # Maps each compiled pattern to a function that, given an instance,
    # returns the bound method that stores the match on that instance.
    # NOTE(review): the header pattern expects exactly two spaces between its
    # three fields, and the dots are unescaped (match any character) --
    # confirm both against a real input file before tightening them.
    regexp = {
        re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'): lambda self: self._processo,
        re.compile(r'Tit.(.*)'): lambda self: self._titular,
        re.compile(r'Procurador: (.*)'): lambda self: self._procurador,
        re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'): lambda self: self._documento,
        re.compile(r'Apres.: (.*) ; Nat.: (.*)'): lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'): lambda self: self._marca,
        re.compile(r'Clas.Prod/Serv: (.*)'): lambda self: self._classe,
        re.compile(r'\*(.*)'): lambda self: self._complemento,
    }

    def __init__(self):
        """Create an empty record.

        'complemento' is the only field that can occur several times in a
        single group, so it is kept as a list.
        """
        self.complemento = []

    def _processo(self, matches):
        """Store the three captures of the header line."""
        self.processo, self.data, self.despacho = matches.groups()

    def _titular(self, matches):
        """Store the text captured after ``Tit.``."""
        self.titular = matches.group(1)

    def _procurador(self, matches):
        """Store the text captured after ``Procurador: ``."""
        self.procurador = matches.group(1)

    def _documento(self, matches):
        """Store the text captured after the ``C.N.P.J./C.I.C./N INPI :`` prefix."""
        self.documento = matches.group(1)

    def _apresentacao(self, matches):
        """Store the ``Apres.`` and ``Nat.`` captures."""
        self.apresentacao, self.natureza = matches.groups()

    def _marca(self, matches):
        """Store the text captured after ``Marca: ``."""
        self.marca = matches.group(1)

    def _classe(self, matches):
        """Store the text captured after ``Clas.Prod/Serv: ``."""
        self.classe = matches.group(1)

    def _complemento(self, matches):
        """Append the text captured after a leading ``*``."""
        self.complemento.append(matches.group(1))

    def read(self, line):
        """Match *line* against the known patterns and store what it captures.

        Lines that match no pattern are ignored.
        """
        for pattern, handler_for in Despacho.regexp.items():
            m = pattern.match(line)
            if m:
                handler_for(self)(m)
                # Every pattern is anchored at the start of the line and the
                # literal prefixes are mutually exclusive, so no other
                # pattern can match once one has.
                break


def process(rpi):
    """Yield one ``Despacho`` per group found in *rpi*.

    *rpi* is any iterable of text lines (an open file works).  A group
    begins on a line starting with ``'No.'`` and ends at the next blank line,
    at the next ``'No.'`` line, or at the end of the input.  Lines that are
    not inside a group are skipped.
    """
    current = None

    for line in rpi:
        if line.startswith('No.'):
            # A header with no blank separator still closes the previous
            # group instead of silently discarding it.
            if current is not None:
                yield current
            current = Despacho()
        elif not line.strip():
            # Blank line: end of the current group, if any.
            if current is not None:
                yield current
                current = None
            continue

        # Lines seen before the first header (or between groups) belong to
        # no record and are skipped rather than raising.
        if current is not None:
            current.read(line)

    # The input may end without a trailing blank line.
    if current is not None:
        yield current


def main():
    """Parse 'rm1972.txt' and pretty-print every record."""
    # The context manager closes the file even if parsing fails.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            pprint.pprint(desp.__dict__)
            print('--------------')


# Guarded so importing this module does not try to open the data file.
if __name__ == '__main__':
    main()

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

score 0 · Answer 1 · 2026-05-11T02:11:35+00:00

That is pretty good. Below are some suggestions; let me know if you like them:

import pprint
import re
import sys


class Despacho(object):
    """Accumulate the fields of one group of lines.

    Each line handed to :meth:`read` is matched against the patterns in
    ``regexp``; every capture group is stored on the instance under the
    attribute name listed at the same position in the pattern's key.
    An attribute only exists once a matching line has been read, except
    ``complemento``, which is always a list.
    """

    # Maps a tuple of attribute names to the pattern whose capture groups
    # fill them, in order (a dict keyed by names instead of handler functions).
    # NOTE(review): the header pattern expects exactly two spaces between its
    # three fields, and the dots are unescaped (match any character) --
    # confirm both against a real input file before tightening them.
    regexp = {
        ('processo',
         'data',
         'despacho'): re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """Create an empty record.

        'complemento' is the only field that can occur several times in a
        single group, so it is kept as a list.
        """
        self.complemento = []

    def read(self, line):
        """Match *line* against the known patterns and store what it captures.

        Lines that match no pattern are ignored.
        """
        # .items() (not the Python 2-only .iteritems()) runs on 2 and 3.
        for attrs, pattern in Despacho.regexp.items():
            m = pattern.match(line)
            if not m:
                continue
            # Capture groups are numbered from 1, in the order of `attrs`.
            for groupn, attr in enumerate(attrs, 1):
                value = m.group(groupn)
                if attr == 'complemento':
                    # Special case: repeated field, accumulate.
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)
            # Every pattern is anchored at the start of the line and the
            # literal prefixes are mutually exclusive, so no other pattern
            # can match once one has.
            break

    def __repr__(self):
        """Return a pretty-printed dict of every known field.

        Fields that were never read appear with the value ``None``.
        """
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)


def process(rpi):
    """Yield one ``Despacho`` per group found in *rpi*.

    *rpi* is any iterable of text lines (an open file works).  A group
    begins on a line starting with ``'No.'`` and ends at the next blank line,
    at the next ``'No.'`` line, or at the end of the input.  Lines that are
    not inside a group are skipped.
    """
    current = None

    for line in rpi:
        if line.startswith('No.'):
            # A header with no blank separator still closes the previous
            # group instead of silently discarding it.
            if current is not None:
                yield current
            current = Despacho()
        elif not line.strip():
            # Blank line: end of the current group, if any.
            if current is not None:
                yield current
                current = None
            continue

        # Lines seen before the first header (or between groups) belong to
        # no record and are skipped rather than raising.
        if current is not None:
            current.read(line)

    # The input may end without a trailing blank line.
    if current is not None:
        yield current


def main():
    """Parse 'rm1972.txt', print every record, and return 0."""
    # The context manager closes the file even if parsing fails.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            # print(...) with one argument behaves the same on Python 2
            # and 3; the record's __repr__ supplies the text.
            print(desp)
            print('-' * 20)
    return 0


if __name__ == '__main__':
    main()

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

I need to read some large files (from 50k to 100k lines), structured in

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply