"""
:mod:`bibstuff.bibfile`: High level BibTeX file interface
---------------------------------------------------------
Provides two classes, BibFile and BibEntry, for accessing the parts of a BibTeX
database. BibFile inherits from ``simpleparse.dispatchprocessor.DispatchProcessor``.
To fill a BibFile instance, bfi, call bibgrammar.Parse(src, bfi).
:copyright: Dylan Schwilk (esp. BibFile) and Alan G Isaac (esp. BibEntry), see AUTHORS
:license: MIT (see LICENSE)
:requires: Python 2.4+
:TODO: make this framework more general, perhaps along the lines of the btparse library in `btOOL <http://www.gerg.ca/software/btOOL/>`_
"""
# Module metadata: docstring markup format, authorship, module version and
# the minimum Python version string.
__docformat__ = "restructuredtext en"
__authors__ = ["Dylan W. Schwilk", "Alan G. Isaac"]
__version__ = '1.13'
__needs__ = '2.4'
# options:
# __strict__ = False allows empty citekeys
# NOTE(review): in the part of the file visible here, __strict__ is only
# referenced from the disabled (string-quoted) code inside BibFile.macro.
__strict__ = False # should we be strict with bibtex format?
####################### IMPORTS #####################################
# import from standard library
import re, logging
bibfile_logger = logging.getLogger('bibstuff_logger')
# import dependencies
from simpleparse.dispatchprocessor import dispatch, DispatchProcessor, getString, lines
#bibstuff imports
# from . import bibgrammar
#####################################################################
############### GLOBAL VARIABLES ##################################
# English month names, in calendar order.
months_en = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)
# Lowercased full names, and the three-letter lowercase macros cut from them.
monthslower_en = [_month.lower() for _month in months_en]
monthmacros_en = [_lowered[:3] for _lowered in monthslower_en]
# Lookup table: three-letter macro -> capitalized full month name.
MONTH_DICT = dict(zip(monthmacros_en, months_en))
#####################################################################
[docs]class BibEntry(dict):
"""
Stores a single bibliographic entry.
Provides a dictionary interface to the fields:
field keys are case-insensitive and fields are stored
in the order added.
:note: 2006-08-10 use 'citekey' instead of 'key' since BibTeX allows a 'key' field
:note: 2008-03-29 'entry_type' instead of 'type' since BibTeX allows a 'type' field
"""
def __init__(self,*args,**kwargs):
dict.__init__(self,*args,**kwargs)
self._fields = []
def __repr__(self):
    """Return the entry formatted as a BibTeX record.

    The record starts with ``@TYPE{citekey,`` followed by one
    ``name = value`` line per recorded field (in the order the fields
    were added) and a closing brace.  Values are wrapped in braces unless:

    - the value contains ``@``, in which case it is double-quoted;
    - the field is ``journal`` and the value is all lowercase letters
      (assumed to be a macro);
    - the field is ``month`` and the value is an English month name or
      three-letter macro (written as the bare lowercase macro);
    - the field is ``year``, ``number``, ``volume`` or ``chapter`` and the
      value converts to a non-zero integer.

    A ``crossref`` value that is itself an entry is written as that
    entry's citekey.
    """
    stringrep = '@%s{%s,\n' % (self.entry_type.upper(), self.citekey)
    if self._fields:
        # longest field name, used to align the '=' signs
        mlen = max(len(key_str) for key_str in self._fields)
    else:  # no fields (not a true entry)
        mlen = 0
        bibfile_logger.warning("Entry apparently has no fields.")
    field_list = []
    for key in self._fields:
        addbraces = True
        val = self[key]
        if key == 'crossref':
            try:
                val = val['citekey']  # might be an entry
            except TypeError:
                pass  # ->must be a string
        elif key == 'journal':
            #:TODO: allow punctuation!!
            if val.isalpha() and val.islower():
                addbraces = False  # i.e., assume it is a macro
        elif key == 'month':
            # always use month macros if possible
            if val.lower() in monthslower_en + monthmacros_en:
                val = val[:3].lower()
                addbraces = False
        elif key in ("year", "number", "volume", "chapter"):
            try:
                addbraces = not int(val)
            except (TypeError, ValueError):
                pass  # not a plain integer: keep the braces
        if '@' in val:  # need to protect '@'
            val = '"' + val + '"'
        elif addbraces:
            val = "{" + val + "}"
        field_list.append(" %-*s = %s" % (mlen, key, val))
    stringrep += ",\n".join(field_list)
    stringrep += '\n}\n'
    return stringrep
def __setitem__(self, key, val):
    """Store `val` under the lowercased `key`.

    The lowercased key is appended to the ordered field list when the
    value is non-empty, the key is not already listed, and the key is
    neither 'citekey' nor 'entry_type'.  Setting a field named 'key' is
    allowed but logged, since the entry id lives under 'citekey'.
    """
    name = key.lower()
    dict.__setitem__(self, name, val)
    if name == "key":
        bibfile_logger.info(
            "Setting 'key' as an entry *field*. (Recall 'citekey' holds the entry id.)")
    already_listed = name in self._fields
    is_identifier = name in ["citekey", "entry_type"]
    if not (already_listed or is_identifier) and val:
        self._fields.append(name)
def __getitem__(self, field):
    """Return the value stored for `field` (case-insensitive).

    `field` is usually a BibTeX field name but may also be 'citekey' or
    'entry_type'.  A missing field does not raise ``KeyError``: if the
    entry's 'crossref' value is an entry of the same class the lookup is
    delegated to it, otherwise the empty string is returned.  A 'month'
    value that is a three-letter macro is expanded to the full month name.
    """
    name = field.lower()
    if name == "key":
        bibfile_logger.info(
            "Seeking 'key' as an entry *field*. (Recall 'citekey' holds the entry id.)")
    try:
        value = dict.__getitem__(self, name)
    except KeyError:
        #:TODO: rethink this decision (but it is used for formatting)
        #:note: 20080331 changed KeyError to return '' instead of None
        value = ''
        parent = self.get('crossref', '')
        if isinstance(parent, self.__class__):
            value = parent[name]
    #:note: 20080331 add handling of month macros
    if name == 'month' and value in monthmacros_en:
        value = MONTH_DICT[value]
    return value
def __delitem__(self,key) :
key = key.lower()
try:
dict.__delitem__(self, key)
except KeyError:
pass
try:
self._fields.remove(key)
except ValueError:
pass
def set_entry_type(self, val):
    """Store `val`, lowercased, under the 'entry_type' key."""
    lowered = val.lower()  #:note: entry_type stored as lowercase
    self["entry_type"] = lowered

def get_entry_type(self):
    """Return the value stored under the 'entry_type' key."""
    stored = self["entry_type"]
    return stored

entry_type = property(
    get_entry_type,
    set_entry_type,
    None,
    "property: 'entry_type'",
)
def set_citekey(self, val):
    """Store `val` unchanged under the 'citekey' key."""
    self.__setitem__("citekey", val)

def get_citekey(self):
    """Return the value stored under the 'citekey' key."""
    stored = self["citekey"]
    return stored

citekey = property(
    get_citekey,
    set_citekey,
    None,
    "property: 'citekey'",
)
def get_fields(self):
    """Return the entry's list of field names (the list itself, not a copy)."""
    field_names = self._fields
    return field_names

def set_fields(self, lst):
    """Replace the entry's list of field names with `lst`."""
    self._fields = lst

fields = property(
    get_fields,
    set_fields,
    None,
    "property: 'fields'",
)
def search_fields(self, string_or_compiled, field='', ignore_case=True):
    """Find regular expression in entry.

    Return MatchObject if string_or_compiled found in entry else None. If
    field is omitted, search is through all fields (but not the citekey);
    the first match ends the search.

    :note: used by BibFile's find_re method, which is used in turn by bibsearch.py
    :note: a field value that is itself an entry (an attached crossref) is
           searched by its stored citekey

    :Parameters:
      `string_or_compiled` : string to compile or compiled regex
        pattern for searching
      `field` : string
        field to search in self (default: search all fields)
      `ignore_case` : bool
        compile a string pattern with ``re.IGNORECASE``; has no effect on
        an already compiled pattern
    """
    if hasattr(string_or_compiled, 'search'):
        # already a compiled regular expression
        reo = string_or_compiled
    else:
        flags = re.MULTILINE
        if ignore_case:
            flags |= re.IGNORECASE
        reo = re.compile(string_or_compiled, flags)

    def searchable(value):
        # an attached crossref is an entry, not text: use its citekey
        if isinstance(value, dict):
            return value.get('citekey', '')
        return value

    # default result, so an entry without any fields yields None
    found = None
    if not field:  # ->try all fields (but not citekey)
        for f in self.get_fields():
            found = reo.search(searchable(self[f]))
            if found:
                break  # no need to check more fields
    # :note: CAN test 'field in self' (an entry does not raise KeyError)
    # BUT test the value first, to single out present-but-empty fields below
    elif self[field]:
        found = reo.search(searchable(self[field]))
    elif field in self:
        bibfile_logger.info("Empty field %s in entry\n%s.\n." % (field, self))
    return found
def format_names(self, names_formatter):
    """Return the entry's names formatted by `names_formatter`.

    The names come from ``self.get_names()``.  If that is a plain string
    it is returned unchanged; otherwise the result of its
    ``format(names_formatter)`` call is returned.

    :type `names_formatter`: NamesFormatter
    :note: called by CitationManager in format_citation
    :note: 2006-08-08 no longer sets a `_names` attribute
    :TODO: add default name_template useful for .bib files?
    """
    bibfile_logger.debug("BibEntry.format_names: arg is:"+str(names_formatter))
    formatted = self.get_names()  # a BibName instance (or possibly, a string)
    if not isinstance(formatted, str):
        # the names object formats itself, given the formatter
        formatted = formatted.format(names_formatter)
    bibfile_logger.debug("BibEntry.format_names result = "+str(formatted))
    return formatted
def get_names(self, entry_formatter=None, try_fields=None):
    """Return whatever ``self.make_names`` builds for this entry.

    When no `entry_formatter` is given and `try_fields` is empty or None,
    the fields tried are 'author', 'editor' and 'organization', in that
    order.  Otherwise both arguments are passed on unchanged.

    :note: 2006-08-09 matching change to `make_names`, no longer sets `self._names`
    """
    use_default_fields = entry_formatter is None and not try_fields
    if use_default_fields:
        try_fields = ['author','editor','organization']
    return self.make_names(entry_formatter, try_fields=try_fields)
def make_names(self, entry_formatter=None, try_fields=None):
    """Return a BibName object built from the entry's "raw" names.

    Without an `entry_formatter`, the fields in `try_fields` are tried in
    order and the first non-empty value is used; if `try_fields` is empty
    or None, 'author', 'editor' and 'organization' are tried.  If every
    field is empty, the empty string is passed on together with the last
    field tried.  With an `entry_formatter`, its
    ``pick_raw_names(self, try_fields)`` chooses the raw names and field.

    :change: 2006-08-02 altered to return BibName instance and not set _names
    :note: a missing field yields an empty value rather than a KeyError
    :note: this method introduces the only dependence on simpleparse (via bibname)
    :TODO: return BibName instance for each available name field??

    :Parameters:
      - `entry_formatter`: EntryFormatter instance to provide style information
      - `try_fields`: list of field names to try sequentially; first non-empty field -> name
    """
    # importing bibname here to avoid recursive import
    from bibstuff import bibname
    if entry_formatter is None:
        if not try_fields:
            # the declared default (None) must not reach the loop below
            try_fields = ['author', 'editor', 'organization']
        raw_names = ''
        field = None
        for field in try_fields:
            raw_names = self[field]
            if raw_names:
                break
    else:
        raw_names, field = entry_formatter.pick_raw_names(self, try_fields)
    return bibname.BibName(raw_names, from_field=field)  # names are in a BibName object
def format_with(self, entry_formatter):
    """Return ``entry_formatter.format_entry(self)``; the formatter does the work."""
    bibfile_logger.debug("BibEntry.format_with: arg is:"+str(entry_formatter))
    formatted = entry_formatter.format_entry(self)
    return formatted
# Default label style for the citekeys created by make_citekey():
# the first `max_names` names are included, then `etal`.
citekey_label_style1 = {
    'name_template': 'v{_}_|l{}',  # "van_der_Meer" or "van_DerStadt"
    'max_names': 2,
    'name_name_sep': '+',
    'etal': 'etal',
    'anonymous': 'anon',
    'lower_name': False,
    # label templates, looked up by entry type
    'article': "%(names)s-%(year)s",
    'book': "%(names)s-%(year)s",
    'misc': "%(names)s-%(year)s",
    'default_type': "%(names)s-%(year)s",
}
#style2 shd be rst compatible
# citekey_label_style2 = dict(
# name_first = 'l{_}',
# name_other = 'l{_}',
# max_names = 2,
# use_max_names = False,
# name_name_sep = ('.','.'),
# etal = '',
# lower_name = True,
# anonymous = 'anon',
# article = "%(names)s-%(year)s-%(jrnl)s",
# book = "%(names)s-%(year)s",
# misc = "%(names)s-%(year)s",
# default_type = "%(names)s-%(year)s",
# )
def make_citekey(self, used_citekeys=(), style=citekey_label_style1):
    """Create and return a new citekey based on the entry's data. This is for
    creating predictable and useful citekey (labels) for BibEntry objects.
    This is not integrated with the citation styles in bibstuff.bibstyles;
    but it serves a very different purpose. This is to create consistent
    citation keys that are easy to type and guess and that are valid BibTeX
    citation keys.

    :Parameters:
      - used_citekeys : sequence
        the already taken citation keys, so that the function can avoid
        duplicates (by appending b, c, d, ... to the year)
      - style : dict
        The format of the citekey is determined by a label style (see below)

    :Returns: string
      the citation key (label)

    :note: if the formatted names are empty, the style's 'anonymous'
           value (when present) is used in their place
    :note: if the label template does not use the year, a suffix cannot
           make the label unique; the duplicate label is then returned
           and a warning is logged

    Example:
    The label style is a dict with the following fields::

      citekey_label_style1 = dict(
          name_template = 'v{_}_|l{}', # see NameFormatter class
          max_names = 2,
          name_name_sep = "+",
          etal = 'etal',
          anonymous = 'anon',
          lower_name = False,
          article = "%(names)s-%(year)s",
          book = "%(names)s-%(year)s",
          misc = "%(names)s-%(year)s",
          default_type = "%(names)s-%(year)s")

    :TODO: Strip LaTeX accent characters from names when making label
    """
    from .bibstyles.shared import NameFormatter
    from string import ascii_lowercase
    entry_type = self.entry_type.lower()
    try:
        label_template = style[entry_type]
    except KeyError:
        label_template = style['default_type']
    name_template = style['name_template']
    max_names = style['max_names']
    name_name_sep = style['name_name_sep']
    lower_name = style['lower_name']
    etal = style['etal']
    # first, make names
    name_formatter = NameFormatter(template=name_template)
    names_dicts = self.get_names().get_names_dicts()
    # make list of 'v_|l' last names, which can possibly have multiple
    # tokens (e.g., two piece last names)
    ls = [name_formatter.format_name(name_dict) for name_dict in names_dicts]
    if len(ls) > max_names:
        ls = ls[:max_names] + [etal]
    names = name_name_sep.join(ls)
    if not names:
        # no usable names: fall back on the style's placeholder, if any
        names = style.get('anonymous', '')
    if lower_name:
        names = names.lower()
    year = self['year'] or '????'
    format_dict = {'names': names, 'year': year}
    if entry_type == "article":
        jrnl = self['journal']
        jrnl = ''.join(jrnl.split()).lower()  # keep macro
        jrnl = jrnl.replace("journal", "j", 1)
        format_dict['jrnl'] = jrnl  # short form, no spaces
    # make unique result: if needed, append suffix b or c or d... to year
    result = label_template % format_dict
    c = 1
    while result in used_citekeys:
        # :note: lowercase since BibTeX does not distinguish case
        sfx = ascii_lowercase[c % 26] * (1 + c // 26)
        format_dict['year'] = year + sfx
        candidate = label_template % format_dict
        if candidate == result:
            # the template ignores the year, so no suffix can ever help
            bibfile_logger.warning(
                "make_citekey: cannot make citekey %s unique." % result)
            break
        result = candidate
        c += 1
    return result
# ----------------------------------------------------------
# Bibfile
# -------
# Data storage for bibtex file
# ----------------------------------------------------------
[docs]class BibFile( DispatchProcessor ):
"""Stores parsed bibtex file. Access entries by key.
:note: a BibFile object should simply *store* .bib file parts
(a list of entries and a macro map) and provide access
to these parts
"""
def __init__(self):
    """Start with no stored entries and an empty macro map."""
    # macro map: filled by the `macro` production, read by `name`
    self._macroMap = dict()
    # entries: appended to by the `entry` production
    self.entries = list()
def get_entrylist(self, citekeys, discard=True):
    """Return the list of entries found for `citekeys`, in the same order.

    With `discard` true (the default) keys without an entry are dropped
    from the result and reported in a warning; otherwise the result has a
    None at the position of each key that was not found.  An empty
    `citekeys` logs a warning and yields an empty list.

    For every entry returned, a 'crossref' value that is a string is
    replaced by the entry having that citekey, if there is one.
    """
    if not citekeys:
        bibfile_logger.warning("get_entrylist: No keys provided; returning empty cited-entry list.")
        return []
    lookups = [(key, self.get_entry_by_citekey(key)) for key in citekeys]
    if discard:
        missing = [key for (key, hit) in lookups if not hit]
        if missing:
            bibfile_logger.warning("Database entries not found for the following keys:\n"+"\n".join(missing))
        result = [hit for (key, hit) in lookups if hit]
    else:  # keep None when occurs in entry list
        result = [hit for (key, hit) in lookups]
    # attach cross references
    for entry in result:
        if not entry:
            continue
        target = entry.get('crossref', None)
        if not isinstance(target, str):
            continue
        target = self.get_entry_by_citekey(target)
        if target:
            entry['crossref'] = target
    return result
def get_entry_by_citekey(self, citekey):
    """Return the first stored entry whose citekey equals `citekey`, else None."""
    match = None
    for candidate in self.entries:
        if candidate.citekey == citekey:
            match = candidate
            break
    return match
"""PRODUCTION FUNCTIONS:
for parsing, must provide a function for each production name.
"""
def string(self, taginfo, buffer):
    """Return a string, stripping leading and trailing markers.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production; the result is ``buffer[start:stop]`` without its first
    and last character.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    """
    (tag, start, stop, subtags) = taginfo
    return buffer[start+1:stop-1]
def number(self, taginfo, buffer):
    """Return a number as a string.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production; the result is the text ``buffer[start:stop]``, unconverted.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    """
    (tag, start, stop, subtags) = taginfo
    return buffer[start:stop]
def entry_type(self, taginfo, buffer):
    """Return the entry type.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production; the text is obtained with simpleparse's ``getString``.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    """
    (tag, start, stop, subtags) = taginfo
    return getString((tag, start, stop, subtags), buffer)
def citekey(self, taginfo, buffer):
    """Return the entry's citekey.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production; the text is obtained with simpleparse's ``getString``.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    """
    (tag, start, stop, subtags) = taginfo
    return getString((tag, start, stop, subtags), buffer)
# macro name
def name(self, taginfo, buffer):
    """Return lookup on name or name if not in map.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production.  The text ``buffer[start:stop]`` is looked up in the macro
    map; text that is not a known macro is returned as is.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    """
    (tag, start, stop, subtags) = taginfo
    text = buffer[start:stop]
    return self._macroMap.get(text, text)
def field(self, taginfo, buffer):
    """Process a bibentry field and return tuple of name, value.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production.  The value is the concatenation of the dispatched results
    of the non-empty children of ``subtags[1]`` (pieces joined with '#'
    in the source); the name is the dispatched result of ``subtags[0]``.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    """
    (tag, start, stop, subtags) = taginfo
    # value pieces are dispatched before the name, as before
    pieces = [dispatch(self, t, buffer) for t in subtags[1][3] if t]
    value = ''.join(pieces)  # concatenate hashed together strings
    return (dispatch(self, subtags[0], buffer), value)
def entry(self, taginfo, buffer):
    """Process the bibentry and its children.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production.  A new BibEntry gets its entry type from ``subtags[0]``,
    its citekey from ``subtags[1]`` and one field per child of
    ``subtags[2]``; it is then appended to ``self.entries``.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    """
    (tag, start, stop, subtags) = taginfo
    entry = BibEntry()
    entry.entry_type = dispatch(self, subtags[0], buffer)
    entry.citekey = dispatch(self, subtags[1], buffer)
    for field in subtags[2][3]:
        k, v = dispatch(self, field, buffer)
        #:note: entry will force k to lowercase
        entry[k] = v
    self.entries.append(entry)
def macro(self, taginfo, buffer):
    """Process a macro entry and add macros to macro map.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production.  Dispatching ``subtags[0]`` yields a (name, value) pair,
    which is stored in the macro map.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    :note: an earlier, disabled variant also accepted macro-syntax entries
           whose type is not STRING, storing them as regular entries with
           a dummy citekey when not ``__strict__``; it is not active
    """
    (tag, start, stop, subtags) = taginfo
    name, value = dispatch(self, subtags[0], buffer)
    self._macroMap[name] = value
def preamble(self, taginfo, buffer):
    """Process the given production and its children.

    `taginfo` is the ``(tag, start, stop, subtags)`` tuple of the matched
    production.  Nothing is stored: a warning is logged, either that the
    entry has preamble syntax but another entry type, or giving the
    preamble's line number and source text.

    :note: the tuple is unpacked in the body because tuple parameters in
           a signature are a syntax error on Python 3
    """
    (tag, start, stop, subtags) = taginfo
    the_type = getString(subtags[0], buffer)
    lineno = lines(0, start, buffer) + 1
    if the_type.upper() != 'PREAMBLE':
        bibfile_logger.warning("Entry at line %d has preamble syntax but entry_type is %s" % (lineno,the_type))
    else:
        bibfile_logger.warning("Preamble entry on line %d:" % lineno + "\n" + buffer[start:stop])
def search_entries(self, string_or_compiled, field='', ignore_case=True):
    """Return list of matching entries.

    Search for regular expression in the fields of each entry.
    If field is omitted, search is through all fields.

    :note: used by bibsearch.py
    :note: the pattern is compiled once here and the compiled object is
           handed to each entry's ``search_fields``

    :Parameters:
      - `string_or_compiled` : string to compile or compiled regex
        pattern for searching
      - `field` : string
        field to search in each entry (default: search all fields)
      - `ignore_case` : bool
        compile a string pattern with ``re.IGNORECASE``; has no effect on
        an already compiled pattern
    """
    if hasattr(string_or_compiled, 'search'):
        # ->already a compiled regular expression
        reo = string_or_compiled
    else:
        flags = re.MULTILINE
        if ignore_case:
            flags |= re.IGNORECASE
        reo = re.compile(string_or_compiled, flags)
    return [entry for entry in self.entries
            if entry.search_fields(string_or_compiled=reo, field=field,
                                   ignore_case=ignore_case)]
# self test
# -------------------------
# usage: bibfile.py DATABASE_FILE
# if __name__ == "__main__":
# import sys
# if len(sys.argv) > 1 :
# src = open(sys.argv[1]).read()
# bfile = BibFile()
# bibgrammar.Parse(src, bfile)
# for entry in bfile.entries :
# print entry
# else :
# print "self test usage: bibfile.py DATABASE_FILE"