"""DBLP web scraper.

COPIED UNDER MIT LICENSE
https://github.com/sebastianGehrmann/dblp-pub

Slightly modified to support the 'Id' property.
"""
# Provenance: added to this repository as dblp.py in commit
# 5d083c8ff8794c598f1e31c24d7f74f36b86cae9, "Add a script for automatically
# DBLP-ifying references (#5)" (Matthew Sotoudeh, 2020-09-16).
from bs4 import BeautifulSoup, Tag
import pandas as pd
import requests

# options
STRINGS_FOR_TEST = ["Collaborative Writing"]
DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
# Seconds to wait for DBLP before giving up; without a timeout requests.get
# can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# Placeholder stored in a field that could not be scraped.
_MISSING = 'nothing'


def query_db(pub_string=STRINGS_FOR_TEST):
    '''
    returns the BeautifulSoup object of a query to DBLP
    :param pub_string: A list of strings of keywords, sent as the ``q``
        query parameter. The default list is shared between calls and is
        never mutated here.
    :return: BeautifulSoup: A BeautifulSoup Object
    :raises requests.RequestException: if the request times out, fails, or
        DBLP answers with an HTTP error status
    '''
    resp = requests.get(PUB_SEARCH_URL, params={'q': pub_string},
                        timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # a (result-free) search page.
    resp.raise_for_status()
    return BeautifulSoup(resp.content, features="lxml")


def _first_link(nav):
    '''
    Returns the href of the first <a> inside the first child of ``nav``
    :param nav: The Tag of a publication entry whose class contains 'publ'
    :return: str: The href, or 'nothing' if no such link is present
    '''
    first_child = nav.contents[0] if nav.contents else None
    # A text node is a str subclass: calling .find('a') on it would run
    # str.find and return an int, so only real tags are searched.
    if not isinstance(first_child, Tag):
        return _MISSING
    anchor = first_child.find('a')
    if anchor is None:
        return _MISSING
    return anchor.attrs.get('href', _MISSING)


def get_pub_data(pub):
    '''
    Extracts the information about a publication from a BeautifulSoup object
    :param pub: A BeautifulSoup Object with Publication Information
    :return: int: the year, if the class of ``pub`` contains 'year';
        otherwise dict: All Information of this Publication (keys 'Type',
        'Link', 'Authors', 'Title', 'Where', 'Id'). Fields that cannot be
        found are set to 'nothing' ('Authors' to an empty list, 'Id' to None).
    '''
    pub_classes = pub.get('class') or []
    if 'year' in pub_classes:
        # year is not always scrapable, except for this case: the element's
        # first child is the year text itself.
        return int(pub.contents[0])

    # The publication type is read from the second class of the element.
    ptype = pub_classes[1] if len(pub_classes) > 1 else _MISSING
    link = _MISSING
    authors = []
    title = _MISSING
    where = _MISSING

    for content_item in pub.contents:
        # Text nodes between the child elements carry no attributes.
        if not isinstance(content_item, Tag):
            continue
        item_classes = content_item.get('class') or []
        if 'data' in item_classes:
            authors.extend(
                author.text
                for author in content_item.find_all(
                    'span', attrs={"itemprop": "author"})
            )
            title_tag = content_item.find('span', attrs={"class": "title"})
            if title_tag is not None:
                title = title_tag.text
            # When several isPartOf spans carry a name, the last one wins.
            for where_data in content_item.find_all(
                    'span', attrs={"itemprop": "isPartOf"}):
                found_where = where_data.find(
                    'span', attrs={"itemprop": "name"})
                if found_where is not None:
                    where = found_where.text
        elif 'publ' in item_classes:
            link = _first_link(content_item)

    return {'Type': ptype,
            'Link': link,
            'Authors': authors,
            'Title': title,
            'Where': where,
            'Id': pub.attrs.get('id'), }


def search(search_string=STRINGS_FOR_TEST):
    '''
    returns the information found in a search query to dblp as a pandas dataframe.
    Shows the following information:
        - Authors
        - Link to Publication
        - Title
        - Type (Article, Proceedings etc.)
        - Where it was published
        - Year of publication
        - Id (the id attribute of the publication's element)
    :param search_string: A List of Strings of Keywords, that should be searched for
    :return: pd.DataFrame: A Dataframe with all data; empty if the result
        page contains no publication list
    :raises requests.RequestException: propagated from query_db
    '''
    soup = query_db(search_string)
    pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})

    pub_list_data = []
    if pub_list_raw is None:
        # No result list on the page (e.g. nothing matched the query).
        return pd.DataFrame(pub_list_data)

    # Year rows precede the publications they apply to, so the most recently
    # seen year is attached to every following entry. 0 means "no year row
    # seen yet".
    curr_year = 0
    for child in pub_list_raw.children:
        # Skip text nodes (e.g. whitespace) between the list items.
        if not isinstance(child, Tag):
            continue
        pub_data = get_pub_data(child)
        if isinstance(pub_data, int):
            curr_year = pub_data
        else:
            pub_data['Year'] = curr_year
            pub_list_data.append(pub_data)

    return pd.DataFrame(pub_list_data)