author    Matthew Sotoudeh <masotoudeh@ucdavis.edu>    2020-09-16 13:25:47 -0700
committer GitHub <noreply@github.com>                  2020-09-16 13:25:47 -0700
commit    5d083c8ff8794c598f1e31c24d7f74f36b86cae9 (patch)
tree      7dd901089aae430fa7c36bbbae61ec5e973da3bb /dblp.py
parent    696252c38a939599cc2f0ea8a05b135120e725d3 (diff)
Add a script for automatically DBLP-ifying references (#5)
Also added documentation for it in the README.
Diffstat (limited to 'dblp.py')
-rw-r--r--    dblp.py    103
1 file changed, 103 insertions, 0 deletions
diff --git a/dblp.py b/dblp.py
new file mode 100644
index 0000000..d6dc859
--- /dev/null
+++ b/dblp.py
@@ -0,0 +1,103 @@
+"""DBLP web scraper.
+
+COPIED UNDER MIT LICENSE
+https://github.com/sebastianGehrmann/dblp-pub
+
+Slightly modified to support the 'Id' property.
+"""
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+
+# options
+STRINGS_FOR_TEST = ["Collaborative Writing"]
+DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
+PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
+
+
+def query_db(pub_string=STRINGS_FOR_TEST):
+    '''
+    Returns the BeautifulSoup object of a query to DBLP.
+    :param pub_string: A list of keyword strings
+    :return: BeautifulSoup: A BeautifulSoup object of the results page
+    '''
+    resp = requests.get(PUB_SEARCH_URL, params={'q': pub_string})
+    return BeautifulSoup(resp.content, features="lxml")
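+
+# Example (a hypothetical query; requires network access to dblp.uni-trier.de):
+#   soup = query_db(["collaborative writing"])
+#   print(soup.title.text)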
+
+def get_pub_data(pub):
+    '''
+    Extracts the information about a publication from a BeautifulSoup object.
+    :param pub: A BeautifulSoup object with publication information
+    :return: dict: All information about this publication, or int: the year
+        when the given element is a year header row
+    '''
+ ptype = 'nothing'
+ link = 'nothing'
+ authors = []
+ title = 'nothing'
+ where = 'nothing'
+
+    if 'year' in pub.get('class'):
+        # DBLP renders the year as its own <li class="year"> row between
+        # entries; return it so the caller can attach it to later entries.
+        return int(pub.contents[0])
+ else:
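+        # The entry's second CSS class encodes its type, e.g. 'article'.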
+ ptype = pub.attrs.get('class')[1]
+ for content_item in pub.contents:
+            # Text nodes have no attrs; treat them as having no classes.
+            class_of_content_item = getattr(content_item, 'attrs', {}).get('class', [])
+ if 'data' in class_of_content_item:
+ for author in content_item.findAll('span', attrs={"itemprop": "author"}):
+ authors.append(author.text)
+ title = content_item.find('span', attrs={"class": "title"}).text
+ for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
+ found_where = where_data.find('span', attrs={"itemprop": "name"})
+ if found_where:
+ where = found_where.text
+ elif 'publ' in class_of_content_item:
+                found_link = content_item.contents[0].find('a')
+                if found_link:
+                    link = found_link.attrs.get('href', "nothing")
+
+ return {'Type': ptype,
+ 'Link': link,
+ 'Authors': authors,
+ 'Title': title,
+ 'Where': where,
+ 'Id': pub.attrs.get('id'),}
+
+def search(search_string=STRINGS_FOR_TEST):
+    '''
+    Returns the information found in a search query to DBLP as a pandas DataFrame.
+    Shows the following information:
+    - Authors
+    - Link to publication
+    - Title
+    - Type (article, proceedings, etc.)
+    - Where it was published
+    - Year of publication
+    :param search_string: A list of keyword strings to search for
+    :return: pd.DataFrame: A DataFrame with all of the above data
+    '''
+ soup = query_db(search_string)
+    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
+    if pub_list_raw is None:
+        # No results page for this query.
+        return pd.DataFrame()
+
+ pub_list_data = []
+ curr_year = 0
+    for child in pub_list_raw.find_all('li', recursive=False):  # skip whitespace text nodes
+ pub_data = get_pub_data(child)
+        if isinstance(pub_data, int):
+ curr_year = pub_data
+ else:
+ pub_data['Year'] = curr_year
+ pub_list_data.append(pub_data)
+
+ return pd.DataFrame(pub_list_data)
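
A minimal usage sketch, assuming the module is saved as dblp.py, that dblp.uni-trier.de is reachable, and that DBLP's markup still matches the scraper above:

    import dblp
    df = dblp.search(["Collaborative Writing"])
    # Expected columns: Type, Link, Authors, Title, Where, Id, Year
    print(df[["Title", "Year"]].head())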