Source code for bibpy.lexers

# -*- coding: utf-8 -*-

"""Various lexer functions used by the parsers."""

from bibpy.lexers.biblexer import BibLexer
from bibpy.lexers.name_lexer import NameLexer
from bibpy.lexers.namelist_lexer import NamelistLexer
import funcparserlib.lexer as lexer
from funcparserlib.lexer import Token


def remove_whitespace_tokens(tokens):
    """Remove any whitespace tokens from a list of tokens."""
    return [token for token in tokens if token.type != 'space']
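
# Illustrative sketch (not part of the source): funcparserlib's Token takes a
# type and a value, so filtering drops every 'space' token:
#
#   >>> toks = [Token('name', 'a'), Token('space', ' '), Token('name', 'b')]
#   >>> [t.value for t in remove_whitespace_tokens(toks)]
#   ['a', 'b']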


def lex_bib(string):
    """Lex a string into bib tokens."""
    return BibLexer().lex(string)
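
# Hedged sketch: assuming BibLexer.lex yields its tokens lazily, callers
# would typically force it with list(). The exact token types are defined by
# BibLexer and are not reproduced here:
#
#   >>> tokens = list(lex_bib('@article{key, title = {A Title}}'))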


def lex_date(date_string):
    """Lex a string into biblatex date tokens."""
    tokenizer = lexer.make_tokenizer([
        ('number', [r'[0-9]+']),
        ('dash', [r'-']),
        ('slash', [r'/'])
    ])

    return tokenizer(date_string)
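
# Sketch of the resulting token stream: digits become 'number' tokens,
# separators become 'dash' tokens, and '/' (the biblatex range separator)
# becomes 'slash':
#
#   >>> [(t.type, t.value) for t in lex_date('1999-12-31')]
#   [('number', '1999'), ('dash', '-'), ('number', '12'), ('dash', '-'),
#    ('number', '31')]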


def lex_string_expr(string):
    """Lex a string expression."""
    tokenizer = lexer.make_tokenizer([
        ('concat', [r'#']),
        ('string', [r'"[^"]+"']),
        ('name', [r'[A-Za-z_][A-Za-z_0-9\-:?\'\.\s]*']),
        ('space', [r'[ \t\r\n]+']),
    ])

    try:
        return remove_whitespace_tokens(tokenizer(string))
    except lexer.LexerError:
        # If we fail to lex the string, it is not a valid string expression so
        # just return it as a single token
        return [Token('string', string)]
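
# Sketch: a bib string concatenation such as an abbreviation joined to a
# quoted string with '#' lexes into name/concat/string tokens (whitespace
# tokens already removed):
#
#   >>> [t.type for t in lex_string_expr('abbrev # "A Journal"')]
#   ['name', 'concat', 'string']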


def lex_braced_expr(string):
    """Lex a braced expression."""
    tokenizer = lexer.make_tokenizer([
        ('lbrace', [r'{']),
        ('rbrace', [r'}']),
        ('content', [r'[^{}]+']),
    ])

    return remove_whitespace_tokens(tokenizer(string))
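
# Sketch: braces become lbrace/rbrace tokens and everything between them is
# 'content'. This lexer defines no 'space' type, so remove_whitespace_tokens
# strips nothing here:
#
#   >>> [(t.type, t.value) for t in lex_braced_expr('{Hello {World}}')]
#   [('lbrace', '{'), ('content', 'Hello '), ('lbrace', '{'),
#    ('content', 'World'), ('rbrace', '}'), ('rbrace', '}')]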


def lex_namelist(string):
    """Lex a namelist delimited by zero brace-level 'and'."""
    return NamelistLexer().lex(string)


def lex_name(string):
    """Lex a name into parts."""
    name_lexer = NameLexer()

    # We have to force evaluation here so commas are also evaluated
    return list(name_lexer.lex(string)), name_lexer.commas
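
# Hedged sketch for the two name helpers. Assuming NamelistLexer splits on
# 'and' only at brace level zero (so braced names stay intact) and NameLexer
# counts the commas it encounters, usage looks roughly like:
#
#   >>> names = list(lex_namelist('A. Author and {Brandt and Sons}'))
#   >>> parts, commas = lex_name('von Beethoven, Ludwig')
#
# The exact shape of the yielded values is defined by NamelistLexer and
# NameLexer and is not reproduced here.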


def lex_generic_query(query):
    """Lex a query string.

    Used by bibpy's accompanying tools.

    """
    tokenizer = lexer.make_tokenizer([
        ('not', [r'\^']),
        ('equals', [r'=']),
        ('approx', [r'~']),
        ('le', [r'<=']),
        ('lt', [r'<']),
        ('ge', [r'>=']),
        ('gt', [r'>']),
        ('comma', [r',']),
        ('dash', [r'-']),
        ('number', [r'-?(0|([1-9][0-9]*))']),
        ('name', [r'\w+']),
        ('space', [r'[ \t\r\n]+']),
        ('any', [r'[^<><=>=\s=\^~]+'])
    ])

    return remove_whitespace_tokens(tokenizer(query))
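
# Sketch: a comparison query tokenizes into name/operator/number tokens. Note
# that 'le' and 'ge' are listed before 'lt' and 'gt', so '<=' is matched as a
# single token rather than as '<' followed by '=':
#
#   >>> [(t.type, t.value) for t in lex_generic_query('year<=2000')]
#   [('name', 'year'), ('le', '<='), ('number', '2000')]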