"git@gitlab.pik-potsdam.de:stenzel/biospheremetrics.git" did not exist on "bcbbc29ea7dcd8f55136d40e90355eecb83c324b"
Select Git revision
01_fetch_list.py
# 03_fetch_s2.py
# Enrich publication records with Semantic Scholar metadata: search the paper
# API by title and accept the top hit only if its title-token overlap is high.
import re
import json
import urllib.parse
import requests
def tokenise(s):
    """Lower-case a string, replace non-word characters with spaces, and split it into tokens."""
    return re.sub(r'\s+', ' ', re.sub(r'\W', ' ', s.lower())).split(' ')


if __name__ == '__main__':
    with open('publications_02.jsonl', 'r') as f_in, \
            open('publications_03.jsonl', 'w') as f_out:
        for li, line in enumerate(f_in):
            print(f'Processing line {li}')
            pub = json.loads(line)

            # Build a Semantic Scholar title search query from the tokenised title.
            tokenised_title = tokenise(pub['title'])
            query = '+'.join([urllib.parse.quote_plus(token) for token in tokenised_title])
            url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={query}'
            print(f' - {url}')

            res = requests.get(url).json()
            # print(res)
            print(f' - {res["total"]} publications found')

            if res['total'] > 0:
                print(f' -> {pub["title"]}')
                # print(f'    {tokenised_title}')
                print(f' -> {res["data"][0]["title"]}')

                # Compare the top hit's title with ours via token-set overlap.
                tt = tokenise(res['data'][0]['title'])
                overlap = set(tokenised_title).intersection(set(tt))
                # print(f'    {tt}')
                print(f' -> Overlap: {len(overlap)} tokens ({len(overlap) / len(set(tokenised_title)):.2%})')

                if (len(overlap) / len(set(tokenised_title))) > 0.8:
                    # Good enough match: fetch the full record for this paper.
                    pid = res['data'][0]['paperId']
                    url_details = f'https://api.semanticscholar.org/graph/v1/paper/{pid}' \
                                  f'?fields=corpusId,url,title,authors,venue,publicationVenue,' \
                                  f'year,externalIds,abstract,publicationTypes,journal,' \
                                  f'publicationDate,openAccessPdf,citationStyles'
                    pub['s2'] = requests.get(url_details).json()
                else:
                    print(' -> SKIP (overlap too small)')
            else:
                print(' -> SKIP (no hits)')

            # Write the (possibly enriched) record back out, one JSON object per line.
            f_out.write(json.dumps(pub) + '\n')

    print('Finished!')