#!/usr/bin/env python
#
# BIBFAST
# Copyright (C) 2006 Mike Nolta
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# TODO:
#  * handle multiline \cite's
#
"""Resolve the cite keys of a LaTeX file against ADS and write a BibTeX file.

Cite keys have the form ``author[/author...]:year[:title]``:

* an author written ``A+B`` is sent to ADS as ``B,A``; the placeholders
  ``...`` and ``etal`` stand for "any further authors";
* ``year`` is a number, or ``prep`` to search the preprint database over
  roughly the last year;
* ``title`` is optional, with ``+`` standing for a space.

Usage: ``bibfast TEXFILE [BIBFILE]``.  Runs on Python 2.6+ and Python 3.
"""

import datetime
import os
import pickle
import re
import sys
import textwrap
import urllib
from xml.dom import minidom

try:  # Python 3 layout
    from urllib.parse import urlencode as _urlencode
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2 layout
    _urlencode = urllib.urlencode
    _urlopen = urllib.urlopen

try:  # Python 2: raw_input returns the line unevaluated
    _read_line = raw_input
except NameError:  # Python 3: input() has that behaviour
    _read_line = input

BIBFAST_VERSION = "0.1"


def _get_text(node):
    """Return the data of the first child of *node*, or '' if it has none."""
    children = node.childNodes
    if not children:
        return ""
    return children[0].data


def _to_text(line):
    """Return *line* as ``str``, decoding it when the server gave bytes."""
    if isinstance(line, str):
        return line
    return line.decode('utf-8', 'replace')


class ADS:
    """Small client for the ADS abstract-search and BibTeX-export CGI pages."""

    def __init__(self):
        # Parsed XML documents of previous searches, keyed by request URL.
        self._cache = {}

    def _parse_record(self, record):
        """Convert one <record> DOM element into a plain dict.

        The dict has the keys 'bibcode', 'title', 'journal' (strings, ''
        when the element is missing or empty) and 'authors' (a list of
        strings, one per <author> element).
        """
        def first_text(tag):
            nodes = record.getElementsByTagName(tag)
            if not nodes:
                return ""
            return _get_text(nodes[0])

        return {
            'bibcode': first_text('bibcode'),
            'title': first_text('title'),
            'journal': first_text('journal'),
            'authors': [_get_text(node)
                        for node in record.getElementsByTagName('author')],
        }

    def _fetch_document(self, url):
        """Return the parsed XML document for *url*, using the cache."""
        if url in self._cache:
            print('(retrieving cached result)')
            return self._cache[url]
        response = _urlopen(url)
        try:
            doc = minidom.parse(response)
        finally:
            response.close()
        self._cache[url] = doc
        return doc

    def _print_records(self, records):
        """Print a numbered, wrapped summary of every candidate record."""
        # NOTE(review): the indent strings below are reproduced exactly as
        # found; their original widths may have been longer -- confirm.
        for number, info in enumerate(records, 1):
            print("")
            wrapper = textwrap.TextWrapper(width=72)
            wrapper.initial_indent = "[%d] " % number
            wrapper.subsequent_indent = " "
            print(wrapper.fill('"%s"' % info['title']))
            wrapper.initial_indent = " "
            print(wrapper.fill("; ".join(info['authors'])))
            print(wrapper.fill(info['journal']))
            print("")

    def _choose_index(self, count):
        """Ask the user for a number in 0..count and return it."""
        while True:
            try:
                index = int(_read_line("Choose (0 for None): "))
            except ValueError:
                print("*** not an integer")
                continue
            if 0 <= index <= count:
                return index
            print("*** out of bounds")

    def search_for_bibcode(self, authors, user_opts):
        """Search ADS for a paper and return its bibcode, or None.

        authors   -- list of author strings; '...' and 'etal' are
                     placeholders meaning the paper may have more authors
                     than the ones named.
        user_opts -- dict of extra query parameters; they override the
                     defaults set here (e.g. 'db_key', 'start_year').

        When several records match, they are listed and the user is asked
        to pick one (0 selects none).
        """
        named_authors = [a for a in authors if a not in ('...', 'etal')]
        # A placeholder was present, so the author count need not match.
        allow_any_number_authors = len(named_authors) != len(authors)

        author_query = ";".join(named_authors)
        # Adding '^' ensures that authors[0] is the first author
        if named_authors and authors[0] == named_authors[0]:
            author_query = "^" + author_query

        base_url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect"
        opts = {
            'author': author_query,
            'aut_logic': 'AND',
            'aut_req': 'YES',
            'data_type': 'SHORT_XML',
            'db_key': 'AST',
        }
        opts.update(user_opts)
        # Sort the parameters so the URL, which is also the cache key,
        # does not depend on dict ordering.
        url = base_url + "?" + _urlencode(sorted(opts.items()))

        query_s = ";".join(authors)
        if 'start_year' in opts and 'end_year' in opts:
            if opts['start_year'] == opts['end_year']:
                query_s += " (%(start_year)d)" % opts
            else:
                query_s += " (%(start_year)d-%(end_year)d)" % opts

        print(url)
        print("searching ADS %s for %s" % (opts['db_key'], query_s))

        doc = self._fetch_document(url)
        records = doc.getElementsByTagName('record')
        if len(records) == 0:
            print("")
            return None

        good_records = []
        for record in records:
            info = self._parse_record(record)
            if allow_any_number_authors or \
                    len(named_authors) == len(info['authors']):
                good_records.append(info)

        if len(good_records) == 0:
            print("didn't find it")
            return None
        elif len(good_records) == 1:
            print("found it!")
        else:
            print("found %d possibilities" % len(good_records))

        self._print_records(good_records)

        if len(good_records) == 1:
            return good_records[0]['bibcode']

        index = self._choose_index(len(good_records))
        if index == 0:
            return None
        return good_records[index - 1]['bibcode']

    def get_bibtex(self, bibcode):
        """Download the BibTeX entry for *bibcode*.

        Returns the entry as a string, or None when the response is empty
        or does not start with the expected ADS banner line.
        """
        base_url = "http://adsabs.harvard.edu/cgi-bin/nph-bib_query"
        opts = [
            ('bibcode', bibcode),
            ('data_type', 'BIBTEX'),
        ]
        url = base_url + "?" + _urlencode(opts)
        response = _urlopen(url)
        try:
            lines = [_to_text(line) for line in response.readlines()]
        finally:
            response.close()
        if not lines or lines[0] != "Query Results from the ADS Database\n":
            return None
        # Drop the five banner lines and the final line of the response.
        return "".join(lines[5:-1])


ads = ADS()


def parse_citekey(citekey):
    """Split a cite key into its parts.

    Returns a dict with 'authors' (list of strings), 'year' (string, not
    validated) and, when a third field is present, 'title'.  Returns None
    when *citekey* is not a string or has no ':'-separated year field.
    """
    try:
        fields = citekey.split(':')
    except (AttributeError, TypeError):
        return None
    if len(fields) < 2:
        return None

    authors = []
    for author in fields[0].split('/'):
        names = author.split('+')
        if len(names) == 2:
            # "A+B" is sent to ADS as "B,A".
            authors.append("%s,%s" % (names[1], names[0]))
        else:
            authors.append(author)

    parsed = {'authors': authors, 'year': fields[1]}
    if len(fields) > 2:
        parsed['title'] = fields[2].replace('+', ' ')
    return parsed


def _load_pickle(path):
    """Return the dict pickled at *path*, or {} if it cannot be read."""
    # NOTE: pickle can execute arbitrary code while loading; only cache
    # files written by this program should ever be found at *path*.
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except Exception:
        # Missing or unreadable cache: deliberately start from scratch.
        return {}


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file afterwards."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


class Bibfast:
    r"""Resolve the \cite, \citet and \citep keys of one LaTeX file.

    Two dicts are kept and persisted next to the LaTeX file:
    ``bibcode_db`` maps cite key -> bibcode and ``bibtex_db`` maps
    bibcode -> BibTeX entry.
    """

    def __init__(self, filename):
        self.filename = filename

    def run(self):
        """Look up every cite key found in the file, then save the caches."""
        print("Bibfast v%s" % BIBFAST_VERSION)
        print("Looking up citations from %s in ADS..." % self.filename)

        self.load_cache()

        pattern = re.compile(r"\\cite[tp]?{([^}]+)}")

        with open(self.filename) as src:
            try:
                for line in src:
                    for m in pattern.finditer(line):
                        for key in m.group(1).split(','):
                            self.process_key(key.strip())
            finally:
                # Keep whatever was resolved even if a lookup failed.
                self.dump_cache()

        print("")
        print("Done looking up citations")

    def process_key(self, key):
        """Resolve one cite key and record its bibcode and BibTeX entry.

        Keys already in ``bibcode_db`` are skipped.  Nothing is recorded
        when the key cannot be parsed, the search finds nothing, or the
        BibTeX entry cannot be retrieved, so such keys are retried on the
        next run.
        """
        print("\n--- cite key %s ---" % key)

        if key in self.bibcode_db:
            print("key already in database, done")
            return

        x = parse_citekey(key)
        if x is None:
            print("don't understand key, ignoring it")
            return

        query = {}
        if x['year'] == 'prep':
            # Preprints: search the last 52 weeks of the preprint database.
            query['db_key'] = 'PRE'
            today = datetime.date.today()
            ayearago = today - datetime.timedelta(weeks=52)
            query['start_year'] = ayearago.year
            query['start_mon'] = ayearago.month
            query['end_year'] = today.year
        else:
            try:
                year = int(x['year'])
            except ValueError:
                print("don't understand key, ignoring it")
                return
            query['db_key'] = 'AST'
            query['start_year'] = year
            query['end_year'] = year

        if 'title' in x:
            query['title'] = x['title']
            query['ttl_req'] = 'YES'

        bibcode = ads.search_for_bibcode(x['authors'], query)
        if bibcode is None:
            print("bibcode not found")
            return

        if self.bibtex_db.get(bibcode) is not None:
            print("retrieving bibtex from cache")
        else:
            print("retrieving bibtex from ADS")
            bibtex = ads.get_bibtex(bibcode)
            if bibtex is None:
                print("bibtex not found")
                return
            self.bibtex_db[bibcode] = bibtex

        # Record the key only once its BibTeX entry is available.
        self.bibcode_db[key] = bibcode

    def load_cache(self):
        """Load both caches from hidden files beside the LaTeX file.

        Sets ``fn_bibtex_db``/``fn_bibcode_db`` (the cache paths) and
        ``bibtex_db``/``bibcode_db`` (empty dicts when unreadable).
        """
        directory, basename = os.path.split(self.filename)
        # The leading dot goes on the file name, not on the directory.
        self.fn_bibtex_db = os.path.join(
            directory, "." + basename + ".bibtex_db")
        self.fn_bibcode_db = os.path.join(
            directory, "." + basename + ".bibcode_db")
        self.bibtex_db = _load_pickle(self.fn_bibtex_db)
        self.bibcode_db = _load_pickle(self.fn_bibcode_db)

    def dump_cache(self):
        """Write both caches to the paths chosen by load_cache()."""
        _dump_pickle(self.bibtex_db, self.fn_bibtex_db)
        _dump_pickle(self.bibcode_db, self.fn_bibcode_db)

    def write_bibtex(self, filename=None):
        """Write every resolved entry to a BibTeX file.

        filename -- output path; defaults to the LaTeX file name with its
                    extension replaced by '-bibfast.bib'.

        Each entry's key is rewritten to the cite key used in the LaTeX
        file.  An existing output file is overwritten without prompting.
        """
        if filename is None:
            ofn = os.path.splitext(self.filename)[0] + '-bibfast.bib'
        else:
            ofn = filename

        print("Writing bibtex file %s" % ofn)

        rekey = re.compile(r'^@(\S+){(\S+),', re.M)

        with open(ofn, 'w') as f:
            f.write("@comment{Generated by bibfast}\n")
            for citekey in self.bibcode_db:
                bibcode = self.bibcode_db[citekey]
                bibtex = self.bibtex_db.get(bibcode)
                if bibtex is None:
                    print("no bibtex for %s, skipping" % citekey)
                    continue

                # A function replacement keeps backslashes in the cite
                # key from being read as regex template escapes.
                def with_citekey(match, citekey=citekey):
                    return "@%s{%s," % (match.group(1), citekey)

                f.write(rekey.sub(with_citekey, bibtex, 1))


def main(argv=None):
    """Command-line entry point: ``bibfast TEXFILE [BIBFILE]``."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit("usage: %s TEXFILE [BIBFILE]" % os.path.basename(argv[0]))

    b = Bibfast(argv[1])
    b.run()
    if len(argv) > 2:
        b.write_bibtex(argv[2])
    else:
        b.write_bibtex()


if __name__ == '__main__':
    main()