summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Sotoudeh <masotoudeh@ucdavis.edu>2020-09-16 13:25:47 -0700
committerGitHub <noreply@github.com>2020-09-16 13:25:47 -0700
commit5d083c8ff8794c598f1e31c24d7f74f36b86cae9 (patch)
tree7dd901089aae430fa7c36bbbae61ec5e973da3bb
parent696252c38a939599cc2f0ea8a05b135120e725d3 (diff)
Add a script for automatically DBLP-ifying references (#5)
Also added documentation for it in the README.
-rw-r--r--.gitignore1
-rw-r--r--BUILD.bazel12
-rw-r--r--README.md11
-rw-r--r--dblp.py90
-rw-r--r--dblpify.py79
5 files changed, 193 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index b545abe..0b56340 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
.*.swp
+__pycache__
.nfs*
/bazel-*
diff --git a/BUILD.bazel b/BUILD.bazel
index 58074f4..5d909b4 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5,3 +5,15 @@ exports_files([
"get_arxivable.sh",
"pdfcrop.sh",
])
+
+py_binary(
+ name = "dblpify",
+ srcs = ["dblpify.py"],
+ deps = [":dblp"],
+ visibility = ["//visibility:public"],
+)
+
+py_library(
+ name = "dblp",
+ srcs = ["dblp.py"],
+)
diff --git a/README.md b/README.md
index b0ddb92..f5051d7 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,17 @@ Every `latex_document` rule creates multiple targets:
* `bazel run [name]_getarxivable` will copy the arXiv-ready version of the
source into the current directory.
+Additionally, a `dblpify` script is provided to interactively replace BibTeX
+entries with standardized DBLP ones. It can be run on a file `main.bib` like
+so:
+```bash
+bazel run @bazel_latex//:dblpify -- main.bib
+```
+This produces an output file `main.dblp.bib`. Note that this script assumes
+you have installed the following Python packages on your system:
+`bibtexparser`, `pandas`, `requests`, and `beautifulsoup4`. Our script is based on the wonderful
+[dblp-pub](https://github.com/sebastianGehrmann/dblp-pub) library.
+
## Goals
These rules are designed to achieve the following goals:
diff --git a/dblp.py b/dblp.py
new file mode 100644
index 0000000..d6dc859
--- /dev/null
+++ b/dblp.py
@@ -0,0 +1,90 @@
+"""DBLP web scraper.
+
+COPIED UNDER MIT LICENSE
+https://github.com/sebastianGehrmann/dblp-pub
+
+Slightly modified to support the 'Id' property.
+"""
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+
# Scraper options / endpoints.
STRINGS_FOR_TEST = ["Collaborative Writing"]  # default query used by search()/query_db()
DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"  # publication-search endpoint
+
+
def query_db(pub_string=STRINGS_FOR_TEST):
    """Fetch DBLP's publication-search page for the given keywords.

    :param pub_string: a list of keyword strings to search for
    :return: a BeautifulSoup object parsed from the result page
    """
    response = requests.get(PUB_SEARCH_URL, params={'q': pub_string})
    # NOTE(review): "lxml" requires the lxml package to be installed; the
    # README does not list it — confirm it is an intended dependency.
    return BeautifulSoup(response.content, features="lxml")
+
def get_pub_data(pub):
    '''
    Extracts the information about a publication from a BeautifulSoup object.

    :param pub: A BeautifulSoup element from DBLP's <ul class="publ-list">
    :return: dict with keys Type/Link/Authors/Title/Where/Id -- OR an int
        year, when @pub is one of DBLP's year-header rows (callers must
        check the return type; see search()).
    '''
    # Defaults used when a field cannot be scraped from this entry.
    ptype = 'nothing'
    link = 'nothing'
    authors = []
    title = 'nothing'
    where = 'nothing'

    if 'year' in pub.get('class'):
        # year is not always scrapable, except for this case. Might be done more elegantly
        return int(pub.contents[0])
    else:
        # Second CSS class of the <li> encodes the publication type
        # (article, inproceedings, ...).
        ptype = pub.attrs.get('class')[1]
        for content_item in pub.contents:
            # [0] fallback: a child with no class attr matches neither branch.
            class_of_content_item = content_item.attrs.get('class', [0])
            if 'data' in class_of_content_item:
                # The "data" cell carries authors, title, and venue markup.
                for author in content_item.findAll('span', attrs={"itemprop": "author"}):
                    authors.append(author.text)
                title = content_item.find('span', attrs={"class": "title"}).text
                for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
                    found_where = where_data.find('span', attrs={"itemprop": "name"})
                    if found_where:
                        where = found_where.text
            elif 'publ' in class_of_content_item:
                # The "publ" cell's first link points at the publication itself.
                link = content_item.contents[0].find('a').attrs.get('href', "nothing")

    # 'Id' is the DBLP record key (the local addition to dblp-pub); it is
    # used by dblpify.py to build the .bib download URL.
    return {'Type': ptype,
            'Link': link,
            'Authors': authors,
            'Title': title,
            'Where': where,
            'Id': pub.attrs.get('id'),}
+
def search(search_string=STRINGS_FOR_TEST):
    '''
    returns the information found in a search query to dblp as a pandas dataframe.
    Shows the following information:
        - Authors
        - Link to Publication
        - Title
        - Type (Article, Proceedings etc.)
        - Where it was published
        - Year of publication
    :param search_string: A List of Strings of Keywords, that should be searched for
    :return: pd.DataFrame: A Dataframe with all data; empty when the query
        matched no publications at all
    '''
    soup = query_db(search_string)
    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
    # Fix: when the query matches nothing, the result page has no
    # publ-list and find() returns None -- iterating .children would
    # raise AttributeError. Callers (e.g. dblpify.search_dblp) rely on
    # an empty DataFrame to loosen their query instead.
    if pub_list_raw is None:
        return pd.DataFrame()

    pub_list_data = []
    curr_year = 0
    for child in pub_list_raw.children:
        pub_data = get_pub_data(child)
        if isinstance(pub_data, int):
            # A bare year-header row: applies to all following entries.
            curr_year = pub_data
        else:
            pub_data['Year'] = curr_year
            pub_list_data.append(pub_data)

    return pd.DataFrame(pub_list_data)
diff --git a/dblpify.py b/dblpify.py
new file mode 100644
index 0000000..c33d82b
--- /dev/null
+++ b/dblpify.py
@@ -0,0 +1,79 @@
+"""Helper script to automatically standardize a BibTeX database.
+"""
+import os
+import sys
+import urllib
+import bibtexparser
+import dblp
+
def main(bib_file):
    """Standardize the file given by @bib_file relative to working directory.

    Writes the DBLP-ified database to [name].dblp.bib next to the input;
    entries that cannot be matched are recorded as BibTeX comments.
    """
    # Fix: the module only does `import urllib`, which does not make the
    # urllib.request submodule available; import it explicitly here.
    import urllib.request

    # Bazel runs the script from its sandbox; BUILD_WORKING_DIRECTORY
    # points back at the directory `bazel run` was invoked from.
    bib_file = os.environ["BUILD_WORKING_DIRECTORY"] + "/" + bib_file
    with open(bib_file, "r") as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)

    # [name].bib -> [name].dblp.bib; the with-block guarantees the output
    # is flushed and closed even if a DBLP lookup fails midway.
    with open(bib_file[:-4] + ".dblp.bib", "w") as out_file:
        for i, entry in enumerate(bib_database.entries):
            print("Entry:", i + 1, "/", len(bib_database.entries))
            print("Title:", entry.get("title", "[None]"))
            print("Author:", entry.get("author", "[None]"))
            bibtex_id = entry["ID"]
            results = search_dblp(entry["title"])

            result = None
            if not results.empty:
                result = select_row(results)

            if result is None:
                # Fix: use .get() for the author -- an entry without an
                # author field should be logged here, not crash the run
                # (the prints above already tolerate a missing author).
                out_file.write("\n% COULD NOT FIND " + entry["ID"]
                               + ": " + entry["title"]
                               + " by " + entry.get("author", "[None]") + "} \n")
                continue

            # Download the canonical BibTeX record for the chosen DBLP id,
            # then swap in the user's original citation key.
            bibtex_url = f"https://dblp.uni-trier.de/rec/{result.Id}.bib?param=1"
            bibtex_entry = urllib.request.urlopen(bibtex_url).read().decode("utf-8")
            bibtex_entry = set_id(bibtex_entry, bibtex_id)
            print(bibtex_entry)
            out_file.write(bibtex_entry)
+
def search_dblp(title):
    """Given a paper title, attempt to search for it on DBLP.

    If @title does not match anything, we iteratively loosen our search
    constraints by dropping the last word of the title until results are
    found. If even the empty query yields nothing, the (empty) result set
    is returned so the caller can record the entry as not found.
    """
    title = title.replace("{", "").replace("}", "")
    words = title.split(" ")
    while True:
        results = dblp.search([" ".join(words)])
        # Fix: the recursive version never terminated once the title was
        # exhausted ("".split(" ")[:-1] is [] -> "" forever), eventually
        # hitting RecursionError. Stop once we run out of words.
        if not results.empty or not words:
            return results
        words = words[:-1]
+
def set_id(bibtex_entry, ID):
    """Return a copy of @bibtex_entry whose citation key is replaced by @ID.

    A BibTeX entry starts ``@type{key,``; the key is everything between
    the first ``{`` and the first ``,``. Raises ValueError if either
    delimiter is missing.
    """
    key_start = bibtex_entry.index("{") + 1
    key_end = bibtex_entry.index(",")
    return f"{bibtex_entry[:key_start]}{ID}{bibtex_entry[key_end:]}"
+
def select_row(results):
    """Given a DataFrame of DBLP entries, prompt the user to select one.

    Re-prompts (re-printing the table) on invalid input. If the user
    enters -1, this function will return None.
    """
    # Fix: loop rather than recurse, so arbitrarily many invalid inputs
    # cannot overflow the stack with RecursionError.
    while True:
        print(results[["Type", "Title", "Authors", "Where"]])
        try:
            row = int(input("Select a row (or -1 for none): "))
        except ValueError:
            # Non-integer input: re-prompt.
            continue
        if 0 <= row < len(results):
            return results.iloc[row]
        if row == -1:
            return None
+
if __name__ == "__main__":
    # Expects exactly one argument: the .bib file path, relative to the
    # directory `bazel run` was invoked from.
    main(sys.argv[1])
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback