author    Matthew Sotoudeh <masotoudeh@ucdavis.edu>    2020-09-16 13:25:47 -0700
committer GitHub <noreply@github.com>                  2020-09-16 13:25:47 -0700
commit    5d083c8ff8794c598f1e31c24d7f74f36b86cae9 (patch)
tree      7dd901089aae430fa7c36bbbae61ec5e973da3bb /dblp.py
parent    696252c38a939599cc2f0ea8a05b135120e725d3 (diff)
Add a script for automatically DBLP-ifying references (#5)
Also added documentation for it in the README.
Diffstat (limited to 'dblp.py')
-rw-r--r--    dblp.py    103
1 file changed, 103 insertions, 0 deletions
diff --git a/dblp.py b/dblp.py
new file mode 100644
index 0000000..d6dc859
--- /dev/null
+++ b/dblp.py
@@ -0,0 +1,103 @@
+"""DBLP web scraper.
+
+COPIED UNDER MIT LICENSE
+https://github.com/sebastianGehrmann/dblp-pub
+
+Slightly modified to support the 'Id' property.
+"""
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+
+# options
+STRINGS_FOR_TEST = ["Collaborative Writing"]
+DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
+PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
+
+
+def query_db(pub_string=STRINGS_FOR_TEST):
+    '''
+    Returns the BeautifulSoup object of a query to DBLP.
+    :param pub_string: A list of keyword strings
+    :return: BeautifulSoup: A BeautifulSoup object of the results page
+    '''
+    resp = requests.get(PUB_SEARCH_URL, params={'q': pub_string})
+    return BeautifulSoup(resp.content, features="lxml")
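+
+# Example (a hypothetical query; requires network access to dblp.uni-trier.de):
+#   soup = query_db(["collaborative writing"])
+#   print(soup.title.text)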
+
+def get_pub_data(pub):
+    '''
+    Extracts the information about a publication from a BeautifulSoup object.
+    :param pub: A BeautifulSoup object with publication information
+    :return: dict: All information about this publication, or int: the year
+        when the given element is a year header row
+    '''
+ ptype = 'nothing'
+ link = 'nothing'
+ authors = []
+ title = 'nothing'
+ where = 'nothing'
+
+    if 'year' in pub.get('class'):
+        # DBLP renders the year as its own <li class="year"> row between
+        # entries; return it so the caller can attach it to later entries.
+        return int(pub.contents[0])
+ else:
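+        # The entry's second CSS class encodes its type, e.g. 'article'.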
+ ptype = pub.attrs.get('class')[1]
+ for content_item in pub.contents:
+            # Text nodes have no attrs; treat them as having no classes.
+            class_of_content_item = getattr(content_item, 'attrs', {}).get('class', [])
+ if 'data' in class_of_content_item:
+ for author in content_item.findAll('span', attrs={"itemprop": "author"}):
+ authors.append(author.text)
+ title = content_item.find('span', attrs={"class": "title"}).text
+ for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
+ found_where = where_data.find('span', attrs={"itemprop": "name"})
+ if found_where:
+ where = found_where.text
+ elif 'publ' in class_of_content_item:
+                found_link = content_item.contents[0].find('a')
+                if found_link:
+                    link = found_link.attrs.get('href', "nothing")
+
+ return {'Type': ptype,
+ 'Link': link,
+ 'Authors': authors,
+ 'Title': title,
+ 'Where': where,
+ 'Id': pub.attrs.get('id'),}
+
+def search(search_string=STRINGS_FOR_TEST):
+    '''
+    Returns the information found in a search query to DBLP as a pandas DataFrame.
+    Shows the following information:
+    - Authors
+    - Link to publication
+    - Title
+    - Type (article, proceedings, etc.)
+    - Where it was published
+    - Year of publication
+    :param search_string: A list of keyword strings to search for
+    :return: pd.DataFrame: A DataFrame with all of the above data
+    '''
+ soup = query_db(search_string)
+    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
+    if pub_list_raw is None:
+        # No results page for this query.
+        return pd.DataFrame()
+
+ pub_list_data = []
+ curr_year = 0
+    for child in pub_list_raw.find_all('li', recursive=False):  # skip whitespace text nodes
+ pub_data = get_pub_data(child)
+        if isinstance(pub_data, int):
+ curr_year = pub_data
+ else:
+ pub_data['Year'] = curr_year
+ pub_list_data.append(pub_data)
+
+ return pd.DataFrame(pub_list_data)
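
A minimal usage sketch, assuming the module is saved as dblp.py, that dblp.uni-trier.de is reachable, and that DBLP's markup still matches the scraper above:

    import dblp
    df = dblp.search(["Collaborative Writing"])
    # Expected columns: Type, Link, Authors, Title, Where, Id, Year
    print(df[["Title", "Year"]].head())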