summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Sotoudeh <masotoudeh@ucdavis.edu>2020-09-16 13:25:47 -0700
committerGitHub <noreply@github.com>2020-09-16 13:25:47 -0700
commit5d083c8ff8794c598f1e31c24d7f74f36b86cae9 (patch)
tree7dd901089aae430fa7c36bbbae61ec5e973da3bb
parent696252c38a939599cc2f0ea8a05b135120e725d3 (diff)
Add a script for automatically DBLP-ifying references (#5)
Also added documentation for it in the README.
-rw-r--r--.gitignore1
-rw-r--r--BUILD.bazel12
-rw-r--r--README.md11
-rw-r--r--dblp.py90
-rw-r--r--dblpify.py79
5 files changed, 193 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index b545abe..0b56340 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
.*.swp
+__pycache__
.nfs*
/bazel-*
diff --git a/BUILD.bazel b/BUILD.bazel
index 58074f4..5d909b4 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5,3 +5,15 @@ exports_files([
"get_arxivable.sh",
"pdfcrop.sh",
])
+
+py_binary(
+ name = "dblpify",
+ srcs = ["dblpify.py"],
+ deps = [":dblp"],
+ visibility = ["//visibility:public"],
+)
+
+py_library(
+ name = "dblp",
+ srcs = ["dblp.py"],
+)
diff --git a/README.md b/README.md
index b0ddb92..f5051d7 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,17 @@ Every `latex_document` rule creates multiple targets:
* `bazel run [name]_getarxivable` will copy the arXiv-ready version of the
source into the current directory.
+Additionally, a `dblpify` script is provided to interactively replace BibTeX
+entries with standardized DBLP ones. It can be run on a file `main.bib` like
+so:
+```bash
+bazel run @bazel_latex//:dblpify -- main.bib
+```
+This produces an output file `main.dblp.bib`. Note that this script assumes
+you have installed the following Python packages on your system:
+`bibtexparser`, `pandas`, `requests`, and `beautifulsoup4`. Our script is based on the wonderful
+[dblp-pub](https://github.com/sebastianGehrmann/dblp-pub) library.
+
## Goals
These rules are designed to achieve the following goals:
diff --git a/dblp.py b/dblp.py
new file mode 100644
index 0000000..d6dc859
--- /dev/null
+++ b/dblp.py
@@ -0,0 +1,90 @@
+"""DBLP web scraper.
+
+COPIED UNDER MIT LICENSE
+https://github.com/sebastianGehrmann/dblp-pub
+
+Slightly modified to support the 'Id' property.
+"""
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+
# Scraper options / endpoints.
STRINGS_FOR_TEST = ["Collaborative Writing"]  # default query used by search()/query_db()
DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"  # publication-search endpoint
+
+
def query_db(pub_string=STRINGS_FOR_TEST):
    """Fetch DBLP's publication-search page for the given keywords.

    :param pub_string: a list of keyword strings to search for
    :return: a BeautifulSoup object parsed from the result page
    """
    response = requests.get(PUB_SEARCH_URL, params={'q': pub_string})
    # NOTE(review): "lxml" requires the lxml package to be installed; the
    # README does not list it — confirm it is an intended dependency.
    return BeautifulSoup(response.content, features="lxml")
+
def get_pub_data(pub):
    '''
    Extracts the information about a publication from a BeautifulSoup object.

    :param pub: A BeautifulSoup element from DBLP's <ul class="publ-list">
    :return: dict with keys Type/Link/Authors/Title/Where/Id -- OR an int
        year, when @pub is one of DBLP's year-header rows (callers must
        check the return type; see search()).
    '''
    # Defaults used when a field cannot be scraped from this entry.
    ptype = 'nothing'
    link = 'nothing'
    authors = []
    title = 'nothing'
    where = 'nothing'

    if 'year' in pub.get('class'):
        # year is not always scrapable, except for this case. Might be done more elegantly
        return int(pub.contents[0])
    else:
        # Second CSS class of the <li> encodes the publication type
        # (article, inproceedings, ...).
        ptype = pub.attrs.get('class')[1]
        for content_item in pub.contents:
            # [0] fallback: a child with no class attr matches neither branch.
            class_of_content_item = content_item.attrs.get('class', [0])
            if 'data' in class_of_content_item:
                # The "data" cell carries authors, title, and venue markup.
                for author in content_item.findAll('span', attrs={"itemprop": "author"}):
                    authors.append(author.text)
                title = content_item.find('span', attrs={"class": "title"}).text
                for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
                    found_where = where_data.find('span', attrs={"itemprop": "name"})
                    if found_where:
                        where = found_where.text
            elif 'publ' in class_of_content_item:
                # The "publ" cell's first link points at the publication itself.
                link = content_item.contents[0].find('a').attrs.get('href', "nothing")

    # 'Id' is the DBLP record key (the local addition to dblp-pub); it is
    # used by dblpify.py to build the .bib download URL.
    return {'Type': ptype,
            'Link': link,
            'Authors': authors,
            'Title': title,
            'Where': where,
            'Id': pub.attrs.get('id'),}
+
def search(search_string=STRINGS_FOR_TEST):
    '''
    returns the information found in a search query to dblp as a pandas dataframe.
    Shows the following information:
        - Authors
        - Link to Publication
        - Title
        - Type (Article, Proceedings etc.)
        - Where it was published
        - Year of publication
    :param search_string: A List of Strings of Keywords, that should be searched for
    :return: pd.DataFrame: A Dataframe with all data; empty when the query
        matched no publications at all
    '''
    soup = query_db(search_string)
    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
    # Fix: when the query matches nothing, the result page has no
    # publ-list and find() returns None -- iterating .children would
    # raise AttributeError. Callers (e.g. dblpify.search_dblp) rely on
    # an empty DataFrame to loosen their query instead.
    if pub_list_raw is None:
        return pd.DataFrame()

    pub_list_data = []
    curr_year = 0
    for child in pub_list_raw.children:
        pub_data = get_pub_data(child)
        if isinstance(pub_data, int):
            # A bare year-header row: applies to all following entries.
            curr_year = pub_data
        else:
            pub_data['Year'] = curr_year
            pub_list_data.append(pub_data)

    return pd.DataFrame(pub_list_data)
diff --git a/dblpify.py b/dblpify.py
new file mode 100644
index 0000000..c33d82b
--- /dev/null
+++ b/dblpify.py
@@ -0,0 +1,79 @@
+"""Helper script to automatically standardize a BibTeX database.
+"""
+import os
+import sys
+import urllib
+import bibtexparser
+import dblp
+
def main(bib_file):
    """Standardize the file given by @bib_file relative to working directory.

    Writes the DBLP-ified database to [name].dblp.bib next to the input;
    entries that cannot be matched are recorded as BibTeX comments.
    """
    # Fix: the module only does `import urllib`, which does not make the
    # urllib.request submodule available; import it explicitly here.
    import urllib.request

    # Bazel runs the script from its sandbox; BUILD_WORKING_DIRECTORY
    # points back at the directory `bazel run` was invoked from.
    bib_file = os.environ["BUILD_WORKING_DIRECTORY"] + "/" + bib_file
    with open(bib_file, "r") as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)

    # [name].bib -> [name].dblp.bib; the with-block guarantees the output
    # is flushed and closed even if a DBLP lookup fails midway.
    with open(bib_file[:-4] + ".dblp.bib", "w") as out_file:
        for i, entry in enumerate(bib_database.entries):
            print("Entry:", i + 1, "/", len(bib_database.entries))
            print("Title:", entry.get("title", "[None]"))
            print("Author:", entry.get("author", "[None]"))
            bibtex_id = entry["ID"]
            results = search_dblp(entry["title"])

            result = None
            if not results.empty:
                result = select_row(results)

            if result is None:
                # Fix: use .get() for the author -- an entry without an
                # author field should be logged here, not crash the run
                # (the prints above already tolerate a missing author).
                out_file.write("\n% COULD NOT FIND " + entry["ID"]
                               + ": " + entry["title"]
                               + " by " + entry.get("author", "[None]") + "} \n")
                continue

            # Download the canonical BibTeX record for the chosen DBLP id,
            # then swap in the user's original citation key.
            bibtex_url = f"https://dblp.uni-trier.de/rec/{result.Id}.bib?param=1"
            bibtex_entry = urllib.request.urlopen(bibtex_url).read().decode("utf-8")
            bibtex_entry = set_id(bibtex_entry, bibtex_id)
            print(bibtex_entry)
            out_file.write(bibtex_entry)
+
def search_dblp(title):
    """Given a paper title, attempt to search for it on DBLP.

    If @title does not match anything, we iteratively loosen our search
    constraints by dropping the last word of the title until results are
    found. If even the empty query yields nothing, the (empty) result set
    is returned so the caller can record the entry as not found.
    """
    title = title.replace("{", "").replace("}", "")
    words = title.split(" ")
    while True:
        results = dblp.search([" ".join(words)])
        # Fix: the recursive version never terminated once the title was
        # exhausted ("".split(" ")[:-1] is [] -> "" forever), eventually
        # hitting RecursionError. Stop once we run out of words.
        if not results.empty or not words:
            return results
        words = words[:-1]
+
def set_id(bibtex_entry, ID):
    """Return a copy of @bibtex_entry whose citation key is replaced by @ID.

    A BibTeX entry starts ``@type{key,``; the key is everything between
    the first ``{`` and the first ``,``. Raises ValueError if either
    delimiter is missing.
    """
    key_start = bibtex_entry.index("{") + 1
    key_end = bibtex_entry.index(",")
    return f"{bibtex_entry[:key_start]}{ID}{bibtex_entry[key_end:]}"
+
def select_row(results):
    """Given a DataFrame of DBLP entries, prompt the user to select one.

    Re-prompts (re-printing the table) on invalid input. If the user
    enters -1, this function will return None.
    """
    # Fix: loop rather than recurse, so arbitrarily many invalid inputs
    # cannot overflow the stack with RecursionError.
    while True:
        print(results[["Type", "Title", "Authors", "Where"]])
        try:
            row = int(input("Select a row (or -1 for none): "))
        except ValueError:
            # Non-integer input: re-prompt.
            continue
        if 0 <= row < len(results):
            return results.iloc[row]
        if row == -1:
            return None
+
if __name__ == "__main__":
    # Expects exactly one argument: the .bib file path, relative to the
    # directory `bazel run` was invoked from.
    main(sys.argv[1])
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback