# -*- coding: utf-8 -*-
"""Lexer for bib(la)tex.
Port of the lexer from bibtex-ruby with a few changes. This lexer also supports
parentheses instead of braces for string, preamble and comment entries, e.g.
'@string(var = 1)' and generates tokens rather returning a list.
"""
from bibpy.lexers.base_lexer import BaseLexer
# A custom lexer is necessary as funcparserlib's lexing infrastructure is
# not up to the task of lexing bib(la)tex's somewhat complicated
# context-dependent structure like nested braces and comments.
[docs]class BibLexer(BaseLexer):
"""Lexer for generating bib tokens."""
def __init__(self):
"""Initialise the lexer."""
super().__init__()
self.reset('')
self.mode = 'comment'
self._modes = {
'bib': self.lex_main,
'entry': self.lex_entry,
'value': self.lex_braced,
'parens': self.lex_parens,
'comment': self.lex_comment
}
self._compile_regexes([
('lbrace', (r'{', self.lex_lbrace)),
('rbrace', (r'}', self.lex_rbrace)),
('equals', (r'\s*(=)\s*', None)),
('comma', (r',', None)),
('number', (r'-?(0|([1-9][0-9]*))', None)),
('name', (r"[ ]*[\w\-:?'\.]+[ ]*", None)),
('entry', (r'@', self.found_entry)),
('string', (r'"[^"]+"', self.lex_string)),
('lparen', (r'\(', self.lex_lparen)),
('rparen', (r'\)', self.lex_rparen)),
('concat', (r'[ ]*#[ ]*', None)),
('space', (r'[ \t\r\n]+', None)),
])
[docs] def reset(self, string):
"""Reset the internal state of the lexer."""
super().reset(string)
self.in_entry = False
[docs] def found_entry(self, value):
"""Handler for finding a bibliographic entry."""
self.in_entry = True
self.ignore_whitespace = True
return self.make_token('entry', value)
[docs] def lex_lbrace(self, value):
"""Lex a left brace."""
self.brace_level += 1
if self.brace_level == 1 and self.bibtype in ('comment', 'preamble'):
self.mode = 'value'
elif self.brace_level > 1:
self.mode = 'value'
return self.make_token('lbrace', value)
[docs] def lex_rbrace(self, value):
"""Lex a right brace."""
self.brace_level -= 1
if self.brace_level == 0:
self.in_entry = False
self.mode = 'comment'
elif self.brace_level < 0:
raise self.raise_unbalanced()
return self.make_token('rbrace', value)
[docs] def lex_lparen(self, value):
"""Lex a left parenthesis."""
if self.bibtype == 'string':
self.mode = 'bib'
self.ignore_whitespace = True
elif self.bibtype in ('comment', 'preamble'):
self.mode = 'parens'
return self.make_token('lparen', value)
[docs] def lex_rparen(self, value):
"""Lex a right parenthesis."""
return self.make_token('rparen', value)
[docs] def lex_parens(self):
"""Lex a set of possibly nested parentheses and its contents."""
paren_level = 1
content = ''
while True:
before, token = self.until('parens')
if token == '(':
paren_level += 1
content += before + token
elif token == ')':
paren_level -= 1
content += before
if paren_level == 0:
yield self.make_token('content', content)
yield self.make_token('rparen', token)
self.mode = 'bib'
break
[docs] def lex_braced(self):
"""Lex a possibly nested braced expression and its contents."""
content = ''
while True:
before, token = self.until('braces')
if token == '{':
self.brace_level += 1
content += before + token
elif token == '}':
self.brace_level -= 1
content += before
if self.brace_level == 0:
yield self.make_token('content', content)
yield self.make_token('rbrace', token)
self.in_entry = False
self.ignore_whitespace = False
self.mode = 'comment'
self.bibtype = None
break
elif self.brace_level == 1 and self.bibtype not in\
('comment', 'string', 'preamble'):
yield self.make_token('content', content)
yield self.make_token('rbrace', token)
self.mode = 'bib'
break
else:
content += token
[docs] def lex_entry(self):
"""Lex a bibliographic entry."""
self.brace_level = 0
bibtype = self.expect('name')
entry_type = bibtype.value.lower()
if entry_type in ('comment', 'preamble', 'string'):
self.bibtype = entry_type
else:
self.bibtype = 'entry'
yield bibtype
self.mode = 'bib'
self.ignore_whitespace = True
[docs] def lex_main(self):
for _, token in self.scan(search_type='match'):
if token is not None:
yield token
else:
self.raise_error('Unmatched characters')