[enh][mod] search refactor

Adam Tauber 2014-07-07 13:59:27 +02:00
parent a07b2b514c
commit b0ba367a1a
4 changed files with 170 additions and 174 deletions
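
The refactor moves the engine querying logic out of searx/engines/__init__.py into a Search.search() method in searx/search.py, taking the helpers default_request_params(), make_callback() and score_results() with it, so webapp.py can drop its do_search() import. A minimal before/after sketch of the call site, based on the webapp.py hunk at the bottom of this diff (how the surrounding Search object is constructed is not shown here):

    # before: module-level function imported from searx.engines
    from searx.engines import search as do_search
    search.results, search.suggestions = do_search(search.query, request,
                                                    search.engines,
                                                    search.pageno, search.lang)

    # after: method on the Search object defined in searx.search
    search.results, search.suggestions = search.search(request)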

searx/engines/__init__.py

@@ -19,19 +19,12 @@ along with searx. If not, see <http://www.gnu.org/licenses/>.
from os.path import realpath, dirname, splitext, join
import sys
from imp import load_source
from itertools import izip_longest, chain
from operator import itemgetter
from urlparse import urlparse, unquote
from datetime import datetime
import grequests
from flask.ext.babel import gettext
from operator import itemgetter
from searx import settings
from searx.utils import gen_useragent
engine_dir = dirname(realpath(__file__))
number_of_searches = 0
engines = {}
categories = {'general': []}
@@ -114,160 +107,6 @@ for engine_data in settings['engines']:
engine_shortcuts[engine.shortcut] = engine.name
def default_request_params():
return {
'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
def make_callback(engine_name, results, suggestions, callback, params):
# creating a callback wrapper for the search engine results
def process_callback(response, **kwargs):
cb_res = []
response.search_params = params
engines[engine_name].stats['page_load_time'] += \
(datetime.now() - params['started']).total_seconds()
try:
search_results = callback(response)
except Exception, e:
engines[engine_name].stats['errors'] += 1
results[engine_name] = cb_res
print '[E] Error with engine "{0}":\n\t{1}'.format(
engine_name, str(e))
return
for result in search_results:
result['engine'] = engine_name
if 'suggestion' in result:
# TODO type checks
suggestions.add(result['suggestion'])
continue
cb_res.append(result)
results[engine_name] = cb_res
return process_callback
def score_results(results):
flat_res = filter(
None, chain.from_iterable(izip_longest(*results.values())))
flat_len = len(flat_res)
engines_len = len(results)
results = []
# deduplication + scoring
for i, res in enumerate(flat_res):
res['parsed_url'] = urlparse(res['url'])
res['host'] = res['parsed_url'].netloc
if res['host'].startswith('www.'):
res['host'] = res['host'].replace('www.', '', 1)
res['engines'] = [res['engine']]
weight = 1.0
if hasattr(engines[res['engine']], 'weight'):
weight = float(engines[res['engine']].weight)
score = int((flat_len - i) / engines_len) * weight + 1
duplicated = False
for new_res in results:
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
if res['host'] == new_res['host'] and\
unquote(p1) == unquote(p2) and\
res['parsed_url'].query == new_res['parsed_url'].query and\
res.get('template') == new_res.get('template'):
duplicated = new_res
break
if duplicated:
if res.get('content') > duplicated.get('content'):
duplicated['content'] = res['content']
duplicated['score'] += score
duplicated['engines'].append(res['engine'])
if duplicated['parsed_url'].scheme == 'https':
continue
elif res['parsed_url'].scheme == 'https':
duplicated['url'] = res['parsed_url'].geturl()
duplicated['parsed_url'] = res['parsed_url']
else:
res['score'] = score
results.append(res)
return sorted(results, key=itemgetter('score'), reverse=True)
def search(query, request, selected_engines, pageno=1, lang='all'):
global engines, categories, number_of_searches
requests = []
results = {}
suggestions = set()
number_of_searches += 1
#user_agent = request.headers.get('User-Agent', '')
user_agent = gen_useragent()
for selected_engine in selected_engines:
if selected_engine['name'] not in engines:
continue
engine = engines[selected_engine['name']]
if pageno > 1 and not engine.paging:
continue
if lang != 'all' and not engine.language_support:
continue
request_params = default_request_params()
request_params['headers']['User-Agent'] = user_agent
request_params['category'] = selected_engine['category']
request_params['started'] = datetime.now()
request_params['pageno'] = pageno
request_params['language'] = lang
request_params = engine.request(query.encode('utf-8'), request_params)
if request_params['url'] is None:
# TODO add support of offline engines
pass
callback = make_callback(
selected_engine['name'],
results,
suggestions,
engine.response,
request_params
)
request_args = dict(
headers=request_params['headers'],
hooks=dict(response=callback),
cookies=request_params['cookies'],
timeout=engine.timeout
)
if request_params['method'] == 'GET':
req = grequests.get
else:
req = grequests.post
request_args['data'] = request_params['data']
# ignoring empty urls
if not request_params['url']:
continue
requests.append(req(request_params['url'], **request_args))
grequests.map(requests)
for engine_name, engine_results in results.items():
engines[engine_name].stats['search_count'] += 1
engines[engine_name].stats['result_count'] += len(engine_results)
results = score_results(results)
for result in results:
for res_engine in result['engines']:
engines[result['engine']].stats['score_count'] += result['score']
return results, suggestions
def get_engines_stats():
# TODO refactor
pageloads = []

searx/search.py

@@ -1,7 +1,96 @@
import grequests
from itertools import izip_longest, chain
from datetime import datetime
from operator import itemgetter
from urlparse import urlparse, unquote
from searx.engines import (
categories, engines, engine_shortcuts
)
from searx.languages import language_codes
from searx.utils import gen_useragent
number_of_searches = 0
def default_request_params():
return {
'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
def make_callback(engine_name, results, suggestions, callback, params):
# creating a callback wrapper for the search engine results
def process_callback(response, **kwargs):
cb_res = []
response.search_params = params
engines[engine_name].stats['page_load_time'] += \
(datetime.now() - params['started']).total_seconds()
try:
search_results = callback(response)
except Exception, e:
engines[engine_name].stats['errors'] += 1
results[engine_name] = cb_res
print '[E] Error with engine "{0}":\n\t{1}'.format(
engine_name, str(e))
return
for result in search_results:
result['engine'] = engine_name
if 'suggestion' in result:
# TODO type checks
suggestions.add(result['suggestion'])
continue
cb_res.append(result)
results[engine_name] = cb_res
return process_callback
def score_results(results):
flat_res = filter(
None, chain.from_iterable(izip_longest(*results.values())))
flat_len = len(flat_res)
engines_len = len(results)
results = []
# deduplication + scoring
for i, res in enumerate(flat_res):
res['parsed_url'] = urlparse(res['url'])
res['host'] = res['parsed_url'].netloc
if res['host'].startswith('www.'):
res['host'] = res['host'].replace('www.', '', 1)
res['engines'] = [res['engine']]
weight = 1.0
if hasattr(engines[res['engine']], 'weight'):
weight = float(engines[res['engine']].weight)
score = int((flat_len - i) / engines_len) * weight + 1
duplicated = False
for new_res in results:
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
if res['host'] == new_res['host'] and\
unquote(p1) == unquote(p2) and\
res['parsed_url'].query == new_res['parsed_url'].query and\
res.get('template') == new_res.get('template'):
duplicated = new_res
break
if duplicated:
if res.get('content') > duplicated.get('content'):
duplicated['content'] = res['content']
duplicated['score'] += score
duplicated['engines'].append(res['engine'])
if duplicated['parsed_url'].scheme == 'https':
continue
elif res['parsed_url'].scheme == 'https':
duplicated['url'] = res['parsed_url'].geturl()
duplicated['parsed_url'] = res['parsed_url']
else:
res['score'] = score
results.append(res)
return sorted(results, key=itemgetter('score'), reverse=True)
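
For reference, the base score computed above depends only on a result's position in the engine-interleaved list: entry i of flat_len results gathered from engines_len engines gets int((flat_len - i) / engines_len) * weight + 1, and a duplicate URL accumulates the scores from every engine that returned it. A quick worked example with made-up numbers:

    # hypothetical numbers: 3 engines returned 30 interleaved results in total
    flat_len, engines_len, weight = 30, 3, 1.0
    first = int((flat_len - 0) / engines_len) * weight + 1    # 11.0
    last = int((flat_len - 29) / engines_len) * weight + 1    # 1.0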
class Search(object):
@@ -112,3 +201,77 @@ class Search(object):
if modified:
self.query = self.query.replace(query_parts[0], '', 1).strip()
self.parse_query()
def search(self, request):
global number_of_searches
requests = []
results = {}
suggestions = set()
number_of_searches += 1
#user_agent = request.headers.get('User-Agent', '')
user_agent = gen_useragent()
for selected_engine in self.engines:
if selected_engine['name'] not in engines:
continue
engine = engines[selected_engine['name']]
if self.pageno > 1 and not engine.paging:
continue
if self.lang != 'all' and not engine.language_support:
continue
request_params = default_request_params()
request_params['headers']['User-Agent'] = user_agent
request_params['category'] = selected_engine['category']
request_params['started'] = datetime.now()
request_params['pageno'] = self.pageno
request_params['language'] = self.lang
request_params = engine.request(self.query.encode('utf-8'),
request_params)
if request_params['url'] is None:
# TODO add support of offline engines
pass
callback = make_callback(
selected_engine['name'],
results,
suggestions,
engine.response,
request_params
)
request_args = dict(
headers=request_params['headers'],
hooks=dict(response=callback),
cookies=request_params['cookies'],
timeout=engine.timeout
)
if request_params['method'] == 'GET':
req = grequests.get
else:
req = grequests.post
request_args['data'] = request_params['data']
# ignoring empty urls
if not request_params['url']:
continue
requests.append(req(request_params['url'], **request_args))
grequests.map(requests)
for engine_name, engine_results in results.items():
engines[engine_name].stats['search_count'] += 1
engines[engine_name].stats['result_count'] += len(engine_results)
results = score_results(results)
for result in results:
for res_engine in result['engines']:
engines[result['engine']]\
.stats['score_count'] += result['score']
return results, suggestions
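
The test updates below only retarget the patch: webapp.py no longer imports do_search, so the mock now has to cover searx.search.Search.search, whose return value keeps the same (results, suggestions) shape. A minimal sketch of the pattern; the mock import and the fake POST request are assumptions, only the patch target, test name and return shape come from this diff:

    from mock import patch

    # inside ViewsTestCase(SearxTestCase)
    @patch('searx.search.Search.search')
    def test_index_html(self, search):
        # the mocked method still returns the (results, suggestions) pair
        search.return_value = (self.test_results, set())
        result = self.app.post('/', data={'q': 'test'})
        self.assertEqual(result.status_code, 200)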

searx/tests/test_webapp.py

@@ -39,7 +39,7 @@ class ViewsTestCase(SearxTestCase):
self.assertEqual(result.status_code, 200)
self.assertIn('<div class="title"><h1>searx</h1></div>', result.data)
@patch('searx.webapp.do_search')
@patch('searx.search.Search.search')
def test_index_html(self, search):
search.return_value = (
self.test_results,
@@ -55,7 +55,7 @@ class ViewsTestCase(SearxTestCase):
result.data
)
@patch('searx.webapp.do_search')
@patch('searx.search.Search.search')
def test_index_json(self, search):
search.return_value = (
self.test_results,
@@ -71,7 +71,7 @@ class ViewsTestCase(SearxTestCase):
self.assertEqual(
result_dict['results'][0]['url'], 'http://first.test.xyz')
@patch('searx.webapp.do_search')
@patch('searx.search.Search.search')
def test_index_csv(self, search):
search.return_value = (
self.test_results,
@@ -86,7 +86,7 @@ class ViewsTestCase(SearxTestCase):
result.data
)
@patch('searx.webapp.do_search')
@patch('searx.search.Search.search')
def test_index_rss(self, search):
search.return_value = (
self.test_results,

searx/webapp.py

@@ -39,8 +39,7 @@ from flask import (
from flask.ext.babel import Babel, gettext, format_date
from searx import settings, searx_dir
from searx.engines import (
search as do_search, categories, engines, get_engines_stats,
engine_shortcuts
categories, engines, get_engines_stats, engine_shortcuts
)
from searx.utils import (
UnicodeWriter, highlight_content, html_to_text, get_themes
@@ -191,12 +190,7 @@ def index():
'index.html',
)
# TODO moar refactor - do_search integration into Search class
search.results, search.suggestions = do_search(search.query,
request,
search.engines,
search.pageno,
search.lang)
search.results, search.suggestions = search.search(request)
for result in search.results: