From 5d083c8ff8794c598f1e31c24d7f74f36b86cae9 Mon Sep 17 00:00:00 2001
From: Matthew Sotoudeh
Date: Wed, 16 Sep 2020 13:25:47 -0700
Subject: Add a script for automatically DBLP-ifying references (#5)

Also added documentation for it in the README.
---
 .gitignore  |  1 +
 BUILD.bazel | 12 +++++++++
 README.md   | 11 ++++++++
 dblp.py     | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 dblpify.py  | 79 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 193 insertions(+)
 create mode 100644 dblp.py
 create mode 100644 dblpify.py

diff --git a/.gitignore b/.gitignore
index b545abe..0b56340 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .*.swp
+__pycache__
 .nfs*
 /bazel-*
diff --git a/BUILD.bazel b/BUILD.bazel
index 58074f4..5d909b4 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5,3 +5,15 @@ exports_files([
     "get_arxivable.sh",
     "pdfcrop.sh",
 ])
+
+py_binary(
+    name = "dblpify",
+    srcs = ["dblpify.py"],
+    deps = [":dblp"],
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "dblp",
+    srcs = ["dblp.py"],
+)
diff --git a/README.md b/README.md
index b0ddb92..f5051d7 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,17 @@ Every `latex_document` rule creates multiple targets:
 * `bazel run [name]_getarxivable` will copy the arXiv-ready version of the
   source into the current directory.
 
+Additionally, a `dblpify` script is provided to interactively replace BibTeX
+entries with standardized DBLP ones. It can be run on a file `main.bib` like
+so:
+```bash
+bazel run @bazel_latex//:dblpify -- main.bib
+```
+producing an output file `main.dblp.bib`. Note that this script assumes you
+have the following Python packages installed on your system: `bibtexparser`,
+`pandas`, `requests`, and `beautifulsoup4`. Our script is based on the
+wonderful [dblp-pub](https://github.com/sebastianGehrmann/dblp-pub) library.
"""DBLP web scraper.

COPIED UNDER MIT LICENSE
https://github.com/sebastianGehrmann/dblp-pub

Slightly modified to support the 'Id' property.
"""
from bs4 import BeautifulSoup
import pandas as pd
import requests

# options
STRINGS_FOR_TEST = ["Collaborative Writing"]
DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
# Bound every HTTP request so an unreachable DBLP mirror cannot hang the
# script forever (requests has NO default timeout).
REQUEST_TIMEOUT = 30


def query_db(pub_string=STRINGS_FOR_TEST):
    '''
    returns the BeautifulSoup object of a query to DBLP
    :param pub_string: A list of strings of keywords
    :return: BeautifulSoup: A BeautifulSoup Object
    '''
    resp = requests.get(PUB_SEARCH_URL, params={'q': pub_string},
                        timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(resp.content, features="lxml")


def get_pub_data(pub):
    '''
    Extracts the information about a publication from a BeautifulSoup object
    :param pub: A BeautifulSoup Object with Publication Information
    :return: dict: All Information of this Publication, or int: the year,
        when @pub is one of the year-separator rows of the result list
    '''
    ptype = 'nothing'
    link = 'nothing'
    authors = []
    title = 'nothing'
    where = 'nothing'

    if 'year' in pub.get('class'):
        # year is not always scrapable, except for this case.
        # Might be done more elegantly
        return int(pub.contents[0])

    ptype = pub.attrs.get('class')[1]
    for content_item in pub.contents:
        # [0] is a harmless non-matching default for tags without a class.
        class_of_content_item = content_item.attrs.get('class', [0])
        if 'data' in class_of_content_item:
            for author in content_item.findAll('span', attrs={"itemprop": "author"}):
                authors.append(author.text)
            title = content_item.find('span', attrs={"class": "title"}).text
            for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
                found_where = where_data.find('span', attrs={"itemprop": "name"})
                if found_where:
                    where = found_where.text
        elif 'publ' in class_of_content_item:
            link = content_item.contents[0].find('a').attrs.get('href', "nothing")

    return {'Type': ptype,
            'Link': link,
            'Authors': authors,
            'Title': title,
            'Where': where,
            'Id': pub.attrs.get('id')}


def search(search_string=STRINGS_FOR_TEST):
    '''
    returns the information found in a search query to dblp as a pandas dataframe.
    Shows the following information:
    - Authors
    - Link to Publication
    - Title
    - Type (Article, Proceedings etc.)
    - Where it was published
    - Year of publication
    :param search_string: A List of Strings of Keywords, that should be searched for
    :return: pd.DataFrame: A Dataframe with all data; empty when DBLP returns
        no publication list for the query
    '''
    soup = query_db(search_string)
    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
    if pub_list_raw is None:
        # No <ul class="publ-list"> on the results page (query had no hits).
        # Previously this crashed with AttributeError on `None.children`;
        # an empty frame keeps callers' `.empty` checks working.
        return pd.DataFrame()

    pub_list_data = []
    curr_year = 0
    for child in pub_list_raw.children:
        pub_data = get_pub_data(child)
        if isinstance(pub_data, int):
            # Year-separator rows apply to every publication that follows.
            curr_year = pub_data
        else:
            pub_data['Year'] = curr_year
            pub_list_data.append(pub_data)

    return pd.DataFrame(pub_list_data)
+""" +import os +import sys +import urllib +import bibtexparser +import dblp + +def main(bib_file): + """Standardize the file given by @bib_file relative to working directory. + """ + bib_file = os.environ["BUILD_WORKING_DIRECTORY"] + "/" + bib_file + with open(bib_file, "r") as bibtex_file: + bib_database = bibtexparser.load(bibtex_file) + + # [name].bib -> [name].dblp.bib + out_file = open(bib_file[:-4] + ".dblp.bib", "w") + + for i, entry in enumerate(bib_database.entries): + print("Entry:", i + 1, "/", len(bib_database.entries)) + print("Title:", entry.get("title", "[None]")) + print("Author:", entry.get("author", "[None]")) + bibtex_id = entry["ID"] + results = search_dblp(entry["title"]) + + if not results.empty: + result = select_row(results) + + if results.empty or result is None: + out_file.write("\n% COULD NOT FIND " + entry["ID"] + + ": " + entry["title"] + + " by " + entry["author"] + "} \n") + continue + + bibtex_url = f"https://dblp.uni-trier.de/rec/{result.Id}.bib?param=1" + bibtex_entry = urllib.request.urlopen(bibtex_url).read().decode("utf-8") + bibtex_entry = set_id(bibtex_entry, bibtex_id) + print(bibtex_entry) + out_file.write(bibtex_entry) + out_file.close() + +def search_dblp(title): + """Given a paper title, attempt to search for it on DBLP. + + If @title does not match anything, we iteratively loosen our search + constraints by dropping the last word of the title until results are found. + """ + title = title.replace("{", "").replace("}", "") + results = dblp.search([title]) + if results.empty: + return search_dblp(" ".join(title.split(" ")[:-1])) + return results + +def set_id(bibtex_entry, ID): + """Given a (string) BibTeX entry, replace its identifier with @ID. + """ + first_bracket = bibtex_entry.index("{") + 1 + first_comma = bibtex_entry.index(",") + return bibtex_entry[:first_bracket] + ID + bibtex_entry[first_comma:] + +def select_row(results): + """Given a DataFrame of DBLP entries, prompt the user to select one. 
+ + If the user enters -1, this function will return None. + """ + print(results[["Type", "Title", "Authors", "Where"]]) + try: + row = int(input("Select a row (or -1 for none): ")) + if 0 <= row < len(results): + result = results.iloc[row] + return result + if row == -1: + return None + except ValueError: + pass + return select_row(results) + +if __name__ == "__main__": + main(sys.argv[1]) -- cgit v1.2.3