From 5d083c8ff8794c598f1e31c24d7f74f36b86cae9 Mon Sep 17 00:00:00 2001
From: Matthew Sotoudeh <masotoudeh@ucdavis.edu>
Date: Wed, 16 Sep 2020 13:25:47 -0700
Subject: Add a script for automatically DBLP-ifying references (#5)

Also added documentation for it in the README.
---
 dblp.py | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 dblp.py

(limited to 'dblp.py')

diff --git a/dblp.py b/dblp.py
new file mode 100644
index 0000000..d6dc859
--- /dev/null
+++ b/dblp.py
@@ -0,0 +1,90 @@
+"""DBLP web scraper.
+
+COPIED UNDER MIT LICENSE
+https://github.com/sebastianGehrmann/dblp-pub
+
+Slightly modified to support the 'Id' property.
+"""
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+
+#options
+STRINGS_FOR_TEST = ["Collaborative Writing"]
+DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
+PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
+
+
+def query_db(pub_string=STRINGS_FOR_TEST):
+    '''
+    returns the BeautifulSoup object of a query to DBLP
+    :param pub_string: A list of strings of keywords
+    :return: BeautifulSoup: A BeautifulSoup Object
+    '''
+    resp = requests.get(PUB_SEARCH_URL, params={'q':pub_string})
+    return BeautifulSoup(resp.content, features="lxml")
+
+def get_pub_data(pub):
+    '''
+    Extracts the information about a publication from a BeautifulSoup object
+    :param pub: A BeautifulSoup Object with Publication Information
+    :return: dict: All Information of this Publication
+    '''
+    ptype = 'nothing'
+    link = 'nothing'
+    authors = []
+    title = 'nothing'
+    where = 'nothing'
+
+    if 'year' in pub.get('class'):
+        # year is not always scrapable, except for this case. Might be done more elegantly
+        return int(pub.contents[0])
+    else:
+        ptype = pub.attrs.get('class')[1]
+        for content_item in pub.contents:
+            class_of_content_item = content_item.attrs.get('class', [0])
+            if 'data' in class_of_content_item:
+                for author in content_item.findAll('span', attrs={"itemprop": "author"}):
+                    authors.append(author.text)
+                title = content_item.find('span', attrs={"class": "title"}).text
+                for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
+                    found_where = where_data.find('span', attrs={"itemprop": "name"})
+                    if found_where:
+                        where = found_where.text
+            elif 'publ' in class_of_content_item:
+                link = content_item.contents[0].find('a').attrs.get('href', "nothing")
+
+    return {'Type': ptype,
+            'Link': link,
+            'Authors': authors,
+            'Title': title,
+            'Where': where,
+            'Id': pub.attrs.get('id'),}
+
+def search(search_string=STRINGS_FOR_TEST):
+    '''
+    returns the information found in a search query to dblp as a pandas dataframe.
+    Shows the following information:
+        - Authors
+        - Link to Publication
+        - Title
+        - Type (Article, Proceedings etc.)
+        - Where it was published
+        - Year of publication
+    :param search_string: A List of Strings of Keywords, that should be searched for
+    :return: pd.DataFrame: A Dataframe with all data
+    '''
+    soup = query_db(search_string)
+    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
+
+    pub_list_data = []
+    curr_year = 0
+    for child in pub_list_raw.children:
+        pub_data = get_pub_data(child)
+        if type(pub_data) == int:
+            curr_year = pub_data
+        else:
+            pub_data['Year'] = curr_year
+            pub_list_data.append(pub_data)
+
+    return pd.DataFrame(pub_list_data)
-- 
cgit v1.2.3