From ee1ffbc87f0b2471e33b14561002c400cd8db861 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 4 Mar 2014 14:19:59 +0100 Subject: [PATCH 1/3] [fix] yahoo engine url extraction --- searx/engines/yahoo.py | 2 +- searx/engines/yahoo_news.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index f83b4b96..f070b8a7 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -35,7 +35,7 @@ def response(resp): for result in dom.xpath(results_xpath): url_string = extract_url(result.xpath(url_xpath), search_url) - start = url_string.find('/RU=')+4 + start = url_string.find('http', url_string.find('/RU=')+1) end = url_string.rfind('/RS') url = unquote(url_string[start:end]) title = extract_text(result.xpath(title_xpath)[0]) diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index 6ece496c..3c257866 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -35,7 +35,7 @@ def response(resp): for result in dom.xpath(results_xpath): url_string = extract_url(result.xpath(url_xpath), search_url) - start = url_string.find('/RU=')+4 + start = url_string.find('http', url_string.find('/RU=')+1) end = url_string.rfind('/RS') url = unquote(url_string[start:end]) title = extract_text(result.xpath(title_xpath)[0]) From 98b6313d5dd073fcdeaad52c684f31c2cabc3715 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 4 Mar 2014 14:20:29 +0100 Subject: [PATCH 2/3] [fix] pep8 --- searx/engines/google_news.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index ca27a5b2..93571860 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -6,7 +6,7 @@ from json import loads categories = ['news'] url = 'https://ajax.googleapis.com/' -search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}' # noqa +search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}' # noqa paging = True language_support = True From 71c2e8222bc5d7115e8e2ed415057f66da3a2f09 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 4 Mar 2014 14:20:37 +0100 Subject: [PATCH 3/3] [enh] better useragent string generation --- searx/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/searx/utils.py b/searx/utils.py index af8ce952..b99a945d 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -4,12 +4,15 @@ import csv from codecs import getincrementalencoder import cStringIO import re +from random import choice +ua_versions = ('26.0', '27.0', '28.0') +ua_os = ('Windows NT 6.3; WOW64', 'X11; Linux x86_64; rv:26.0') +ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}" def gen_useragent(): # TODO - ua = "Mozilla/5.0 (X11; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0" - return ua + return ua.format(os=choice(ua_os), version=choice(ua_versions)) def highlight_content(content, query):