[add] arxiv engine

This commit is contained in:
jibe-b 2017-09-23 14:16:06 +02:00 committed by Noémi Ványi
parent 6d28e9d694
commit 3e3672e079
3 changed files with 137 additions and 0 deletions

73
searx/engines/arxiv.py Normal file
View File

@ -0,0 +1,73 @@
#!/usr/bin/env python
"""
ArXiV (Scientific preprints)
@website https://arxiv.org
@provide-api yes (export.arxiv.org/api/query)
@using-api yes
@results XML-RSS
@stable yes
@parse url, title, publishedDate, content
More info on api: https://arxiv.org/help/api/user-manual
"""
from lxml import html
from datetime import datetime
from searx.url_utils import urlencode
# engine dependent config
categories = ['science']
number_of_results = 10  # entries requested per result page

# URL template for the arXiv export API; the placeholders are filled in by
# request() for every search.
base_url = (
    'http://export.arxiv.org/api/query?search_query=all:'
    '{query}&start={offset}&max_results={number_of_results}'
)
def request(query, params):
    """Build the arXiv API request URL for one page of results.

    Args:
        query: the search terms to look up.
        params: request parameters; ``params['pageno']`` is read (page 1
            maps to offset 0) and ``params['url']`` is set.

    Returns:
        The same ``params`` mapping with ``'url'`` filled in.
    """
    # basic search
    offset = (params['pageno'] - 1) * number_of_results

    # Percent-encode the query so characters such as '&', '#' or '=' cannot
    # break or inject parameters into the URL. urlencode() is applied to a
    # throwaway key and the 'q=' prefix is dropped, because base_url already
    # carries the real parameter name.
    quoted_query = urlencode({'q': query})[len('q='):]

    params['url'] = base_url.format(query=quoted_query,
                                    offset=offset,
                                    number_of_results=number_of_results)

    return params
def response(resp):
    """Turn an arXiv API feed into a list of result dicts.

    Args:
        resp: response object whose ``text`` attribute holds the feed.

    Returns:
        A list with one dict per ``<entry>`` element, each with the keys
        ``url``, ``title``, ``publishedDate`` (a ``datetime``) and
        ``content`` (the summary, cut to 300 characters plus "...").

    Raises:
        AttributeError: if ``resp`` has no usable ``text`` attribute.
        IndexError: if an entry lacks ``<title>``, ``<id>``, ``<summary>``
            or ``<published>``.
        ValueError: if ``<published>`` is not in ``%Y-%m-%dT%H:%M:%SZ`` form.
    """
    results = []

    # The text is encoded to bytes before parsing -- presumably because lxml
    # rejects text input that carries an XML encoding declaration (confirm).
    feed = html.fromstring(resp.text.encode('utf-8'))

    for entry in feed.xpath('//entry'):
        title = entry.xpath('.//title')[0].text
        url = entry.xpath('.//id')[0].text
        # An empty <summary/> has text None; fall back to '' so the string
        # handling below cannot fail.
        content = entry.xpath('.//summary')[0].text or ''

        # If a DOI is available, add it to the snippet. The DOI link is an
        # empty element (<link title="doi" href="..."/>), so the value lives
        # in the href attribute; its text is only used as a fallback.
        doi_links = entry.xpath('.//link[@title="doi"]')
        if doi_links:
            doi = doi_links[0].get('href') or doi_links[0].text
            if doi:
                content = 'DOI: ' + doi + ' Abstract: ' + content

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        publishedDate = datetime.strptime(entry.xpath('.//published')[0].text,
                                          '%Y-%m-%dT%H:%M:%SZ')

        res_dict = {'url': url,
                    'title': title,
                    'publishedDate': publishedDate,
                    'content': content}

        results.append(res_dict)

    return results

View File

@ -60,6 +60,12 @@ engines:
disabled : True
shortcut : ai
- name : arxiv
engine : arxiv
shortcut : arx
categories : science
timeout : 4.0
- name : base
engine : base
shortcut : bs

View File

@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
import mock
from searx.engines import arxiv
from searx.testing import SearxTestCase
class TestBaseEngine(SearxTestCase):
    """Unit tests for the arxiv engine's ``request`` and ``response``."""
    # NOTE(review): the class name looks copied from the ``base`` engine
    # tests; it is kept unchanged so selecting the test by name still works.

    def test_request(self):
        """``request`` must produce a URL pointing at the arXiv export API."""
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        params = arxiv.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn('export.arxiv.org/api/', params['url'])

    def test_response(self):
        """``response`` must reject non-responses and parse feed entries."""
        # Objects without a ``text`` attribute are not valid responses.
        self.assertRaises(AttributeError, arxiv.response, None)
        self.assertRaises(AttributeError, arxiv.response, [])
        self.assertRaises(AttributeError, arxiv.response, '')
        self.assertRaises(AttributeError, arxiv.response, '[]')

        # A feed without entries yields no results.
        response = mock.Mock(text='''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"></feed>''')
        self.assertEqual(arxiv.response(response), [])

        xml_mock = '''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title type="html">ArXiv Query: search_query=all:test_query&amp;id_list=&amp;start=0&amp;max_results=1</title>
<id>http://arxiv.org/api/1</id>
<updated>2000-01-21T00:00:00-01:00</updated>
<opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:totalResults>
<opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
<opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>
<entry>
<id>http://arxiv.org/1</id>
<updated>2000-01-01T00:00:01Z</updated>
<published>2000-01-01T00:00:01Z</published>
<title>Mathematical proof.</title>
<summary>Mathematical formula.</summary>
<author>
<name>A. B.</name>
</author>
<link href="http://arxiv.org/1" rel="alternate" type="text/html"/>
<link title="pdf" href="http://arxiv.org/1" rel="related" type="application/pdf"/>
<category term="math.QA" scheme="http://arxiv.org/schemas/atom"/>
<category term="1" scheme="http://arxiv.org/schemas/atom"/>
</entry>
</feed>
'''

        # Pass the feed as plain text, like the empty-feed mock above:
        # arxiv.response() encodes resp.text itself, and pre-encoded bytes
        # have no .encode() on Python 3.
        response = mock.Mock(text=xml_mock)
        results = arxiv.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['title'], 'Mathematical proof.')
        self.assertEqual(results[0]['content'], 'Mathematical formula.')