From b575f898c0957475fcf6ecb179315b14617f39e1 Mon Sep 17 00:00:00 2001
From: dadosch
Date: Sat, 18 Aug 2018 19:24:02 +0200
Subject: [PATCH] duden.de engine

---
 Note: the blob ids on the "index" lines predate the review changes below.

 searx/engines/duden.py           | 90 ++++++++++++++++++++++++++++++++
 searx/settings.yml               |  5 ++
 tests/unit/engines/test_duden.py | 42 +++++++++++++++
 3 files changed, 137 insertions(+)
 create mode 100644 searx/engines/duden.py
 create mode 100644 tests/unit/engines/test_duden.py

diff --git a/searx/engines/duden.py b/searx/engines/duden.py
new file mode 100644
index 00000000..881ff9d9
--- /dev/null
+++ b/searx/engines/duden.py
@@ -0,0 +1,90 @@
+"""
+ Duden
+ @website     https://www.duden.de
+ @provide-api no
+ @using-api   no
+ @results     HTML (using search portal)
+ @stable      no (HTML can change)
+ @parse       url, title, content
+"""
+
+from lxml import html, etree
+import re
+from searx.engines.xpath import extract_text
+from searx.url_utils import quote
+from searx import logger
+
+categories = ['general']
+paging = True
+language_support = False
+
+# search-url
+base_url = 'https://www.duden.de/'
+search_url = base_url + 'suchen/dudenonline/{query}?page={offset}'
+
+
+def request(query, params):
+    '''pre-request callback
+
+    Build the search URL for ``query`` and store it in ``params['url']``.
+
+    params (dict):
+      method  : POST/GET
+      headers : {}
+      data    : {} # if method == POST
+      url     : ''
+      category: 'search category'
+      pageno  : 1 # number of the requested page
+
+    Returns the same ``params`` dict with ``url`` filled in.
+    '''
+
+    # searx numbers pages from 1; the ``page`` parameter sent is one less
+    offset = params['pageno'] - 1
+    params['url'] = search_url.format(offset=offset, query=quote(query))
+    return params
+
+
+def response(resp):
+    '''post-response callback
+
+    resp: requests response object
+
+    Returns a list of result dicts (``url``, ``title``, ``content``),
+    preceded by a ``number_of_results`` entry when the total could be read.
+    '''
+    results = []
+
+    # lxml raises on an empty document, and there is nothing to extract anyway
+    if not resp.text.strip():
+        return results
+
+    dom = html.fromstring(resp.text)
+
+    try:
+        number_of_results_string = re.sub('[^0-9]', '', dom.xpath(
+            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
+        )
+        results.append({'number_of_results': int(number_of_results_string)})
+    except (IndexError, ValueError):
+        # counter element missing or without digits; the total is optional
+        logger.debug("Couldn't read number of results.")
+
+    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
+        logger.debug("running for %s", result)
+        try:
+            link = result.xpath('.//h2/a')[0]
+            url = link.attrib.get('href')
+            title = result.xpath('string(.//h2/a)')
+            content = extract_text(result.xpath('.//p'))
+        except Exception:
+            # best effort: skip a section that cannot be parsed, keep the rest
+            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
+            continue
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 0bd16ca2..54b3d622 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -714,6 +714,11 @@ engines:
     shortcut : 1337x
     disabled : True
 
+  - name : Duden
+    engine : duden
+    shortcut : du
+    disabled : True
+
 #  - name : yacy
 #    engine : yacy
 #    shortcut : ya
diff --git a/tests/unit/engines/test_duden.py b/tests/unit/engines/test_duden.py
new file mode 100644
index 00000000..d9bbfef8
--- /dev/null
+++ b/tests/unit/engines/test_duden.py
@@ -0,0 +1,42 @@
+from collections import defaultdict
+import mock
+from searx.engines import duden
+from searx.testing import SearxTestCase
+from datetime import datetime
+
+
+class TestDudenEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'Haus'
+        dic = defaultdict(dict)
+        dic['pageno'] = 1
+        params = duden.request(query, dic)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('duden.de', params['url'])
+
+    def test_response(self):
+        # an empty body must yield no results instead of a parser error
+        resp = mock.Mock(text='')
+        self.assertEqual(duden.response(resp), [])
+
+        html = """
+        <section class="wide">
+        <h2><a href="https://this.is.the.url/">This is the title also here</a></h2>
+        <p>This is the content</p>
+        <a href="https://this.is.the.url/">Zum vollst&auml;ndigen Artikel</a>
+        </section>
+        """
+
+        resp = mock.Mock(text=html)
+        results = duden.response(resp)
+
+        self.assertIsInstance(results, list)
+        self.assertEqual(len(results), 1)
+
+        # testing result (dictionary entry)
+        r = results[0]
+        self.assertEqual(r['url'], 'https://this.is.the.url/')
+        self.assertEqual(r['title'], 'This is the title also here')
+        self.assertEqual(r['content'], 'This is the content')