Merge pull request #1700 from kvch/intro-offline-engines

Initialize support for offline engines
This commit is contained in:
Adam Tauber 2019-10-16 13:54:54 +00:00 committed by GitHub
commit 12f42d1572
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 227 additions and 61 deletions

View File

@ -27,7 +27,7 @@ from json import loads
from requests import get from requests import get
from searx import settings from searx import settings
from searx import logger from searx import logger
from searx.utils import load_module, match_language from searx.utils import load_module, match_language, get_engine_from_settings
logger = logger.getChild('engines') logger = logger.getChild('engines')
@ -53,7 +53,8 @@ engine_default_args = {'paging': False,
'disabled': False, 'disabled': False,
'suspend_end_time': 0, 'suspend_end_time': 0,
'continuous_errors': 0, 'continuous_errors': 0,
'time_range_support': False} 'time_range_support': False,
'offline': False}
def load_engine(engine_data): def load_engine(engine_data):
@ -128,14 +129,16 @@ def load_engine(engine_data):
engine.stats = { engine.stats = {
'result_count': 0, 'result_count': 0,
'search_count': 0, 'search_count': 0,
'page_load_time': 0,
'page_load_count': 0,
'engine_time': 0, 'engine_time': 0,
'engine_time_count': 0, 'engine_time_count': 0,
'score_count': 0, 'score_count': 0,
'errors': 0 'errors': 0
} }
if not engine.offline:
engine.stats['page_load_time'] = 0
engine.stats['page_load_count'] = 0
for category_name in engine.categories: for category_name in engine.categories:
categories.setdefault(category_name, []).append(engine) categories.setdefault(category_name, []).append(engine)
@ -173,11 +176,6 @@ def get_engines_stats():
results_num = \ results_num = \
engine.stats['result_count'] / float(engine.stats['search_count']) engine.stats['result_count'] / float(engine.stats['search_count'])
if engine.stats['page_load_count'] != 0:
load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa
else:
load_times = 0
if engine.stats['engine_time_count'] != 0: if engine.stats['engine_time_count'] != 0:
this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa
else: else:
@ -189,14 +187,19 @@ def get_engines_stats():
else: else:
score = score_per_result = 0.0 score = score_per_result = 0.0
max_pageload = max(load_times, max_pageload) if not engine.offline:
load_times = 0
if engine.stats['page_load_count'] != 0:
load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa
max_pageload = max(load_times, max_pageload)
pageloads.append({'avg': load_times, 'name': engine.name})
max_engine_times = max(this_engine_time, max_engine_times) max_engine_times = max(this_engine_time, max_engine_times)
max_results = max(results_num, max_results) max_results = max(results_num, max_results)
max_score = max(score, max_score) max_score = max(score, max_score)
max_score_per_result = max(score_per_result, max_score_per_result) max_score_per_result = max(score_per_result, max_score_per_result)
max_errors = max(max_errors, engine.stats['errors']) max_errors = max(max_errors, engine.stats['errors'])
pageloads.append({'avg': load_times, 'name': engine.name})
engine_times.append({'avg': this_engine_time, 'name': engine.name}) engine_times.append({'avg': this_engine_time, 'name': engine.name})
results.append({'avg': results_num, 'name': engine.name}) results.append({'avg': results_num, 'name': engine.name})
scores.append({'avg': score, 'name': engine.name}) scores.append({'avg': score, 'name': engine.name})
@ -255,7 +258,7 @@ def initialize_engines(engine_list):
load_engines(engine_list) load_engines(engine_list)
def engine_init(engine_name, init_fn): def engine_init(engine_name, init_fn):
init_fn() init_fn(get_engine_from_settings(engine_name))
logger.debug('%s engine: Initialized', engine_name) logger.debug('%s engine: Initialized', engine_name)
for engine_name, engine in engines.items(): for engine_name, engine in engines.items():

View File

@ -66,7 +66,7 @@ def get_client_id():
return "" return ""
def init(): def init(engine_settings=None):
global guest_client_id global guest_client_id
# api-key # api-key
guest_client_id = get_client_id() guest_client_id = get_client_id()

View File

@ -55,7 +55,7 @@ def obtain_token():
return token return token
def init(): def init(engine_settings=None):
obtain_token() obtain_token()

View File

@ -225,6 +225,9 @@ def https_url_rewrite(result):
def on_result(request, search, result): def on_result(request, search, result):
if 'parsed_url' not in result:
return True
if result['parsed_url'].scheme == 'http': if result['parsed_url'].scheme == 'http':
https_url_rewrite(result) https_url_rewrite(result)
return True return True

View File

@ -35,6 +35,9 @@ def get_doi_resolver(args, preference_doi_resolver):
def on_result(request, search, result): def on_result(request, search, result):
if 'parsed_url' not in result:
return True
doi = extract_doi(result['parsed_url']) doi = extract_doi(result['parsed_url'])
if doi and len(doi) < 50: if doi and len(doi) < 50:
for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'): for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):

View File

@ -30,6 +30,9 @@ preference_section = 'privacy'
def on_result(request, search, result): def on_result(request, search, result):
if 'parsed_url' not in result:
return True
query = result['parsed_url'].query query = result['parsed_url'].query
if query == "": if query == "":

View File

@ -197,6 +197,13 @@ class ResultContainer(object):
self.infoboxes.append(infobox) self.infoboxes.append(infobox)
def _merge_result(self, result, position): def _merge_result(self, result, position):
if 'url' in result:
self.__merge_url_result(result, position)
return
self.__merge_result_no_url(result, position)
def __merge_url_result(self, result, position):
result['parsed_url'] = urlparse(result['url']) result['parsed_url'] = urlparse(result['url'])
# if the result has no scheme, use http as default # if the result has no scheme, use http as default
@ -210,51 +217,60 @@ class ResultContainer(object):
if result.get('content'): if result.get('content'):
result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
# check for duplicates duplicated = self.__find_duplicated_http_result(result)
duplicated = False if duplicated:
self.__merge_duplicated_http_result(duplicated, result, position)
return
# if there is no duplicate found, append result
result['positions'] = [position]
with RLock():
self._merged_results.append(result)
def __find_duplicated_http_result(self, result):
result_template = result.get('template') result_template = result.get('template')
for merged_result in self._merged_results: for merged_result in self._merged_results:
if 'parsed_url' not in merged_result:
continue
if compare_urls(result['parsed_url'], merged_result['parsed_url'])\ if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
and result_template == merged_result.get('template'): and result_template == merged_result.get('template'):
if result_template != 'images.html': if result_template != 'images.html':
# not an image, same template, same url : it's a duplicate # not an image, same template, same url : it's a duplicate
duplicated = merged_result return merged_result
break
else: else:
# it's an image # it's an image
# it's a duplicate if the parsed_url, template and img_src are differents # it's a duplicate if the parsed_url, template and img_src are differents
if result.get('img_src', '') == merged_result.get('img_src', ''): if result.get('img_src', '') == merged_result.get('img_src', ''):
duplicated = merged_result return merged_result
break return None
# merge duplicates together def __merge_duplicated_http_result(self, duplicated, result, position):
if duplicated: # using content with more text
# using content with more text if result_content_len(result.get('content', '')) >\
if result_content_len(result.get('content', '')) >\ result_content_len(duplicated.get('content', '')):
result_content_len(duplicated.get('content', '')): duplicated['content'] = result['content']
duplicated['content'] = result['content']
# merge all result's parameters not found in duplicate # merge all result's parameters not found in duplicate
for key in result.keys(): for key in result.keys():
if not duplicated.get(key): if not duplicated.get(key):
duplicated[key] = result.get(key) duplicated[key] = result.get(key)
# add the new position # add the new position
duplicated['positions'].append(position) duplicated['positions'].append(position)
# add engine to list of result-engines # add engine to list of result-engines
duplicated['engines'].add(result['engine']) duplicated['engines'].add(result['engine'])
# using https if possible # using https if possible
if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
duplicated['url'] = result['parsed_url'].geturl() duplicated['url'] = result['parsed_url'].geturl()
duplicated['parsed_url'] = result['parsed_url'] duplicated['parsed_url'] = result['parsed_url']
# if there is no duplicate found, append result def __merge_result_no_url(self, result, position):
else: result['engines'] = set([result['engine']])
result['positions'] = [position] result['positions'] = [position]
with RLock(): with RLock():
self._merged_results.append(result) self._merged_results.append(result)
def order_results(self): def order_results(self):
for result in self._merged_results: for result in self._merged_results:

View File

@ -77,7 +77,7 @@ def send_http_request(engine, request_params):
return req(request_params['url'], **request_args) return req(request_params['url'], **request_args)
def search_one_request(engine, query, request_params): def search_one_http_request(engine, query, request_params):
# update request parameters dependent on # update request parameters dependent on
# search-engine (contained in engines folder) # search-engine (contained in engines folder)
engine.request(query, request_params) engine.request(query, request_params)
@ -97,7 +97,53 @@ def search_one_request(engine, query, request_params):
return engine.response(response) return engine.response(response)
def search_one_offline_request(engine, query, request_params):
    """Run ``query`` against an offline engine.

    The call is delegated to ``engine.search(query, request_params)`` and its
    return value is handed back unchanged.
    """
    offline_results = engine.search(query, request_params)
    return offline_results
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Run one engine request through the handler matching the engine type.

    Engines whose ``offline`` attribute is truthy are handled by
    ``search_one_offline_request_safe``; every other engine is handled by
    ``search_one_http_request_safe``. All arguments are forwarded unchanged
    and the handler's return value is returned.
    """
    if engines[engine_name].offline:
        safe_handler = search_one_offline_request_safe
    else:
        safe_handler = search_one_http_request_safe

    return safe_handler(engine_name, query, request_params, result_container, start_time, timeout_limit)
def search_one_offline_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Query an offline engine and record its results, timing and errors.

    Results (if any) are added to ``result_container`` and the time elapsed
    since ``start_time`` is stored both in the container and in the engine
    statistics. Exceptions do not propagate: they are counted, logged and,
    unless they are a ``ValueError``, the engine is reported as unresponsive.

    ``timeout_limit`` is accepted so the signature matches the HTTP variant;
    it is not used in this function.
    """
    engine = engines[engine_name]

    try:
        found = search_one_offline_request(engine, query, request_params)
        if found:
            result_container.extend(engine_name, found)

        elapsed = time() - start_time
        # the same duration is passed for both timing arguments
        result_container.add_timing(engine_name, elapsed, elapsed)

        # NOTE(review): this lock object is created right here on every call,
        # so it does not look shared between threads -- confirm whether a
        # common lock was intended.
        with threading.RLock():
            engine.stats['engine_time'] += elapsed
            engine.stats['engine_time_count'] += 1

    except Exception as e:
        # every failure is counted and timed first
        record_offline_engine_stats_on_error(engine, result_container, start_time)

        if isinstance(e, ValueError):
            # invalid input: logged only, the engine is not marked unresponsive
            logger.exception('engine {0} : invalid input : {1}'.format(engine_name, e))
        else:
            result_container.add_unresponsive_engine((
                engine_name,
                u'{0}: {1}'.format(gettext('unexpected crash'), e),
            ))
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
def record_offline_engine_stats_on_error(engine, result_container, start_time):
    """Record timing and bump the error counter after a failed offline search.

    The time elapsed since ``start_time`` is registered in
    ``result_container`` under ``engine.name`` (the same value for both timing
    arguments) and ``engine.stats['errors']`` is incremented by one.
    """
    elapsed = time() - start_time
    result_container.add_timing(engine.name, elapsed, elapsed)

    # NOTE(review): a new lock object is created for each call -- confirm
    # whether a lock shared between threads was intended.
    with threading.RLock():
        engine.stats['errors'] += 1
def search_one_http_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
# set timeout for all HTTP requests # set timeout for all HTTP requests
requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time) requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
# reset the HTTP total time # reset the HTTP total time
@ -111,7 +157,7 @@ def search_one_request_safe(engine_name, query, request_params, result_container
try: try:
# send requests and parse the results # send requests and parse the results
search_results = search_one_request(engine, query, request_params) search_results = search_one_http_request(engine, query, request_params)
# check if the engine accepted the request # check if the engine accepted the request
if search_results is not None: if search_results is not None:
@ -427,20 +473,22 @@ class Search(object):
continue continue
# set default request parameters # set default request parameters
request_params = default_request_params() request_params = {}
request_params['headers']['User-Agent'] = user_agent if not engine.offline:
request_params = default_request_params()
request_params['headers']['User-Agent'] = user_agent
if hasattr(engine, 'language') and engine.language:
request_params['language'] = engine.language
else:
request_params['language'] = search_query.lang
request_params['safesearch'] = search_query.safesearch
request_params['time_range'] = search_query.time_range
request_params['category'] = selected_engine['category'] request_params['category'] = selected_engine['category']
request_params['pageno'] = search_query.pageno request_params['pageno'] = search_query.pageno
if hasattr(engine, 'language') and engine.language:
request_params['language'] = engine.language
else:
request_params['language'] = search_query.lang
# 0 = None, 1 = Moderate, 2 = Strict
request_params['safesearch'] = search_query.safesearch
request_params['time_range'] = search_query.time_range
# append request to list # append request to list
requests.append((selected_engine['name'], search_query.query, request_params)) requests.append((selected_engine['name'], search_query.query, request_params))

File diff suppressed because one or more lines are too long

View File

@ -325,6 +325,10 @@ a {
font-size: 0.9em; font-size: 0.9em;
} }
.result .engines {
text-align: right;
}
.result .content { .result .content {
margin: 0; margin: 0;
color: #666; color: #666;

File diff suppressed because one or more lines are too long

View File

@ -376,6 +376,10 @@ table {
width: 100%; width: 100%;
} }
.result-table {
margin-bottom: 10px;
}
td { td {
padding: 0 4px; padding: 0 4px;
} }

View File

@ -0,0 +1,13 @@
{# Renders every remaining field of a result as a "KEY: value" table row,
   followed by the list of engines that returned it. #}
<div class="result">
    <table>
        {% for key, value in result.items() %}
        {# these keys get no row; "continue" is provided by the jinja2 loopcontrols extension #}
        {% if key in ['engine', 'engines', 'template', 'score', 'category', 'positions'] %}
            {% continue %}
        {% endif %}
        <tr>
            {# Only "content" and "title" are escaped (and highlighted) by the
               web application before rendering, so only they may bypass
               autoescaping. Any other field is arbitrary engine data and is
               left to autoescaping to prevent HTML injection. #}
            <td><b>{{ key|upper }}</b>: {% if key in ['content', 'title'] %}{{ value|safe }}{% else %}{{ value }}{% endif %}</td>
        </tr>
        {% endfor %}
    </table>
    <p class="engines">{{ result.engines|join(', ') }}</p>
</div>

View File

@ -0,0 +1,13 @@
{# Renders every remaining field of a result as a "KEY: value" table row and
   closes with a row listing the engines that returned it. #}
<table class="result-table">
    {% for key, value in result.items() %}
    {# these keys get no row; "continue" is provided by the jinja2 loopcontrols extension #}
    {% if key in ['engine', 'engines', 'template', 'score', 'category', 'positions'] %}
        {% continue %}
    {% endif %}
    <tr>
        {# Only "content" and "title" are escaped (and highlighted) by the web
           application before rendering, so only they may bypass autoescaping.
           Any other field is arbitrary engine data and is left to
           autoescaping to prevent HTML injection. #}
        <td><b>{{ key|upper }}</b>: {% if key in ['content', 'title'] %}{{ value|safe }}{% else %}{{ value }}{% endif %}</td>
    </tr>
    {% endfor %}
    <tr>
        <td><b>ENGINES</b>: {{ result.engines|join(', ') }}</td>
    </tr>
</table>

View File

@ -14,7 +14,7 @@
<!-- Draw result header --> <!-- Draw result header -->
{% macro result_header(result, favicons) -%} {% macro result_header(result, favicons) -%}
<h4 class="result_header">{% if result.engine~".png" in favicons %}{{ draw_favicon(result.engine) }} {% endif %}{{ result_link(result.url, result.title|safe) }}</h4> <h4 class="result_header">{% if result.engine~".png" in favicons %}{{ draw_favicon(result.engine) }} {% endif %}{% if result.url %}{{ result_link(result.url, result.title|safe) }}{% else %}{{ result.title|safe}}{% endif %}</h4>
{%- endmacro %} {%- endmacro %}
<!-- Draw result sub header --> <!-- Draw result sub header -->
@ -31,12 +31,16 @@
{% for engine in result.engines %} {% for engine in result.engines %}
<span class="label label-default">{{ engine }}</span> <span class="label label-default">{{ engine }}</span>
{% endfor %} {% endfor %}
{% if result.url %}
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info") }}</small> <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info") }}</small>
{% endif %}
{% if proxify %} {% if proxify %}
<small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info") }}</small> <small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info") }}</small>
{% endif %} {% endif %}
</div> </div>
{% if result.pretty_url %}
<div class="external-link">{{ result.pretty_url }}</div> <div class="external-link">{{ result.pretty_url }}</div>
{% endif %}
{%- endmacro %} {%- endmacro %}
<!-- Draw result footer --> <!-- Draw result footer -->
@ -45,11 +49,15 @@
{% for engine in result.engines %} {% for engine in result.engines %}
<span class="label label-default">{{ engine }}</span> <span class="label label-default">{{ engine }}</span>
{% endfor %} {% endfor %}
{% if result.url %}
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info") }}</small> <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info") }}</small>
{% endif %}
{% if proxify %} {% if proxify %}
<small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info") }}</small> <small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info") }}</small>
{% endif %} {% endif %}
{% if result.pretty_url %}
<div class="external-link">{{ result.pretty_url }}</div> <div class="external-link">{{ result.pretty_url }}</div>
{% endif %}
{%- endmacro %} {%- endmacro %}
{% macro preferences_item_header(info, label, rtl) -%} {% macro preferences_item_header(info, label, rtl) -%}

View File

@ -0,0 +1,19 @@
{% from 'oscar/macros.html' import result_footer, result_footer_rtl with context %}
{# Lists each remaining field of the result as a "KEY: value" table row inside a panel, then draws the result footer. #}
<div class="panel panel-default">
<table class="table table-responsive table-bordered table-condensed">
{% for key, value in result.items() %}
{# these keys get no row; "continue" is provided by the jinja2 loopcontrols extension #}
{% if key in ['engine', 'engines', 'template', 'score', 'category', 'positions'] %}
{% continue %}
{% endif %}
<tr>
{# value is printed without the "safe" filter #}
<td><b>{{ key|upper }}</b>: {{ value }}</td>
</tr>
{% endfor %}
</table>
{# the right-to-left footer macro is used when "rtl" is set, the regular one otherwise #}
{% if rtl %}
{{ result_footer_rtl(result) }}
{% else %}
{{ result_footer(result) }}
{% endif %}
</div>

View File

@ -0,0 +1,11 @@
{# Lists each remaining field of the result as a "KEY: value" table row, then the engines that returned it. #}
<table>
{% for key, value in result.items() %}
{# these keys get no row; "continue" is provided by the jinja2 loopcontrols extension #}
{% if key in ['engine', 'engines', 'template', 'score', 'category', 'positions'] %}
{% continue %}
{% endif %}
<tr>
{# value is printed without the "safe" filter #}
<td><b>{{ key|upper }}</b>: {{ value }}</td>
</tr>
{% endfor %}
</table>
{# the trailing empty expression with "-" markers strips the whitespace around it #}
<div class="engines">{% for engine in result.engines %}<span>{{ engine }}</span>{% endfor %}</div>{{- '' -}}

View File

@ -435,3 +435,18 @@ def ecma_unescape(s):
# "%20" becomes " ", "%F3" becomes "ó" # "%20" becomes " ", "%F3" becomes "ó"
s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s) s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
return s return s
def get_engine_from_settings(name):
    """Look up the settings.yml configuration of the engine called ``name``.

    The entries of ``settings['engines']`` are scanned in order and the first
    one whose ``'name'`` equals ``name`` is returned; entries without a
    ``'name'`` key are ignored. An empty dict is returned when there is no
    ``'engines'`` section or no entry matches.
    """
    if 'engines' in settings:
        for engine_settings in settings['engines']:
            if 'name' in engine_settings and name == engine_settings['name']:
                return engine_settings

    return {}

View File

@ -124,6 +124,7 @@ app = Flask(
app.jinja_env.trim_blocks = True app.jinja_env.trim_blocks = True
app.jinja_env.lstrip_blocks = True app.jinja_env.lstrip_blocks = True
app.jinja_env.add_extension('jinja2.ext.loopcontrols')
app.secret_key = settings['server']['secret_key'] app.secret_key = settings['server']['secret_key']
if not searx_debug \ if not searx_debug \
@ -538,14 +539,16 @@ def index():
if output_format == 'html': if output_format == 'html':
if 'content' in result and result['content']: if 'content' in result and result['content']:
result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query) result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
result['title'] = highlight_content(escape(result['title'] or u''), search_query.query) if 'title' in result and result['title']:
result['title'] = highlight_content(escape(result['title'] or u''), search_query.query)
else: else:
if result.get('content'): if result.get('content'):
result['content'] = html_to_text(result['content']).strip() result['content'] = html_to_text(result['content']).strip()
# removing html content and whitespace duplications # removing html content and whitespace duplications
result['title'] = ' '.join(html_to_text(result['title']).strip().split()) result['title'] = ' '.join(html_to_text(result['title']).strip().split())
result['pretty_url'] = prettify_url(result['url']) if 'url' in result:
result['pretty_url'] = prettify_url(result['url'])
# TODO, check if timezone is calculated right # TODO, check if timezone is calculated right
if 'publishedDate' in result: if 'publishedDate' in result: