Skip to content
Snippets Groups Projects
Select Git revision
  • baf66eb9817edf5f86b5622c74cd4c203438a2c7
  • main default
  • master
3 results

01_fetch_list.py

Blame
  • 03_fetch_s2.py 1.99 KiB
    import re
    import json
    import urllib.parse
    
    import requests
    
    
    def tokenise(s):
        """Lower-case *s* and split it into word tokens.

        Non-word characters (punctuation, etc.) act as separators.
        Using ``\\W+`` and a bare ``split()`` avoids the empty-string
        tokens that the old ``split(' ')`` produced for leading/trailing
        punctuation — those empty tokens inflated the title-overlap ratio
        and added a trailing '+' to the search query.
        """
        return re.sub(r'\W+', ' ', s.lower()).split()
    
    
    if __name__ == '__main__':

        # For each publication from the previous pipeline stage, search the
        # Semantic Scholar Graph API by tokenised title; when the top hit's
        # title overlaps the original strongly enough (> 80% of the unique
        # title tokens), attach its detailed metadata under pub['s2'].
        # Every record is written out again, matched or not.
        with open('publications_02.jsonl', 'r') as f_in, \
                open('publications_03.jsonl', 'w') as f_out:
            for li, line in enumerate(f_in):
                print(f'Processing line {li}')
                pub = json.loads(line)

                tokenised_title = tokenise(pub['title'])

                query = '+'.join([urllib.parse.quote_plus(token) for token in tokenised_title])
                url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={query}'
                print(f'  - {url}')

                # timeout= keeps a stalled connection from hanging the whole
                # batch; .get('total', 0) guards against error payloads (e.g.
                # rate-limit responses) that carry no 'total' key, which used
                # to crash the run with a KeyError.
                res = requests.get(url, timeout=30).json()
                # print(res)
                total = res.get('total', 0)
                print(f'  - {total} publications found')

                title_tokens = set(tokenised_title)
                if total > 0 and title_tokens:
                    print(f'  -> {pub["title"]}')
                    # print(f'     {tokenised_title}')
                    print(f'  -> {res["data"][0]["title"]}')

                    tt = tokenise(res['data'][0]['title'])
                    overlap = title_tokens.intersection(set(tt))
                    # print(f'     {tt}')

                    ratio = len(overlap) / len(title_tokens)
                    print(f'  -> Overlap: {len(overlap)} tokens ({ratio:.2%})')

                    if ratio > 0.8:
                        pid = res['data'][0]['paperId']
                        url_details = f'https://api.semanticscholar.org/graph/v1/paper/{pid}' \
                                      f'?fields=corpusId,url,title,authors,venue,publicationVenue,' \
                                      f'year,externalIds,abstract,publicationTypes,journal,' \
                                      f'publicationDate,openAccessPdf,citationStyles'
                        pub['s2'] = requests.get(url_details, timeout=30).json()
                    else:
                        print('  -> SKIP (overlap too small)')
                else:
                    print('  -> SKIP (no hits)')

                f_out.write(json.dumps(pub) + '\n')

        print('Finished!')