Lexical analyzers are quite easy to write when you have regexes. Today I wanted to write a simple general analyzer in Python, and came up with:
import re import sys class Token(object): ''' A simple Token structure. Contains the token type, value and position. ''' def __init__(self, type, val, pos): self.type = type self.val = val self.pos = pos def __str__(self): return '%s(%s) at %s' % (self.type, self.val, self.pos) class LexerError(Exception): ''' Lexer error exception. pos: Position in the input line where the error occurred. ''' def __init__(self, pos): self.pos = pos class Lexer(object): ''' A simple regex-based lexer/tokenizer. See below for an example of usage. ''' def __init__(self, rules, skip_whitespace=True): ''' Create a lexer. rules: A list of rules. Each rule is a `regex, type` pair, where `regex` is the regular expression used to recognize the token and `type` is the type of the token to return when it's recognized. skip_whitespace: If True, whitespace (\s+) will be skipped and not reported by the lexer. Otherwise, you have to specify your rules for whitespace, or it will be flagged as an error. ''' self.rules = [] for regex, type in rules: self.rules.append((re.compile(regex), type)) self.skip_whitespace = skip_whitespace self.re_ws_skip = re.compile('\S') def input(self, buf): ''' Initialize the lexer with a buffer as input. ''' self.buf = buf self.pos = 0 def token(self): ''' Return the next token (a Token object) found in the input buffer. None is returned if the end of the buffer was reached. In case of a lexing error (the current chunk of the buffer matches no rule), a LexerError is raised with the position of the error. ''' if self.pos >= len(self.buf): return None else: if self.skip_whitespace: m = self.re_ws_skip.search(self.buf[self.pos:]) if m: self.pos += m.start() else: return None for token_regex, token_type in self.rules: m = token_regex.match(self.buf[self.pos:]) if m: value = self.buf[self.pos + m.start():self.pos + m.end()] tok = Token(token_type, value, self.pos) self.pos += m.end() return tok # if we're here, no rule matched raise LexerError(self.pos) def tokens(self): ''' Returns an iterator to the tokens found in the buffer. ''' while 1: tok = self.token() if tok is None: break yield tok if __name__ == '__main__': rules = [ ('\d+', 'NUMBER'), ('[a-zA-Z_]\w+', 'IDENTIFIER'), ('\+', 'PLUS'), ('\-', 'MINUS'), ('\*', 'MULTIPLY'), ('\/', 'DIVIDE'), ('\(', 'LP'), ('\)', 'RP'), ('=', 'EQUALS'), ] lx = Lexer(rules, skip_whitespace=True) lx.input('erw = _abc + 12*(R4-623902) ') try: for tok in lx.tokens(): print tok except LexerError, err: print 'LexerError at position', err.pos
It works just fine, but I’m a bit worried that it’s too inefficient. Are there any regex tricks that will allow me to write it in a more efficient / elegant way ?
Specifically, is there a way to avoid looping over all the regex rules linearly to find one that fits?
You can merge all your regexes into one using the ‘|’ operator and let the regex library do the work of discerning between tokens. Some care should be taken to ensure the preference of tokens (for example to avoid matching a keyword as an identifier).