From 4d1770398a6af8902e75c0bd885781584d39e796 Mon Sep 17 00:00:00 2001 From: marc Date: Thu, 20 Jul 2017 15:47:20 -0500 Subject: [PATCH] remove 'all' option from search languages --- searx/engines/archlinux.py | 6 +- searx/engines/bing.py | 5 +- searx/engines/bing_news.py | 5 +- searx/engines/dailymotion.py | 5 +- searx/engines/duckduckgo.py | 12 +--- searx/engines/duckduckgo_images.py | 8 +-- searx/engines/faroo.py | 5 +- searx/engines/gigablast.py | 9 +-- searx/engines/google.py | 3 +- searx/engines/google_news.py | 5 +- searx/engines/mediawiki.py | 5 +- searx/engines/photon.py | 7 +-- searx/engines/qwant.py | 23 ++++--- searx/engines/startpage.py | 5 +- searx/engines/subtitleseeker.py | 2 +- searx/engines/swisscows.py | 5 +- searx/engines/twitter.py | 7 +-- searx/engines/wikidata.py | 4 -- searx/engines/wikipedia.py | 2 +- searx/engines/yacy.py | 4 +- searx/engines/yahoo.py | 4 +- searx/engines/yahoo_news.py | 5 +- searx/engines/youtube_api.py | 4 +- searx/preferences.py | 1 - searx/query.py | 34 ++++++----- searx/search.py | 6 +- searx/settings.yml | 2 +- searx/settings_robot.yml | 2 +- searx/templates/courgette/preferences.html | 1 - searx/templates/legacy/preferences.html | 1 - searx/templates/oscar/languages.html | 11 ++-- searx/templates/oscar/preferences.html | 4 +- searx/templates/pix-art/preferences.html | 1 - searx/webapp.py | 4 +- tests/unit/engines/test_archlinux.py | 2 +- tests/unit/engines/test_bing.py | 6 +- tests/unit/engines/test_bing_news.py | 4 -- tests/unit/engines/test_dailymotion.py | 4 -- tests/unit/engines/test_duckduckgo_images.py | 2 +- tests/unit/engines/test_faroo.py | 8 +-- tests/unit/engines/test_gigablast.py | 7 +-- tests/unit/engines/test_google.py | 2 +- tests/unit/engines/test_google_news.py | 7 +-- tests/unit/engines/test_mediawiki.py | 4 -- tests/unit/engines/test_qwant.py | 3 +- tests/unit/engines/test_startpage.py | 4 -- tests/unit/engines/test_subtitleseeker.py | 36 ----------- tests/unit/engines/test_swisscows.py | 5 -- tests/unit/engines/test_twitter.py | 6 -- tests/unit/engines/test_wikidata.py | 6 +- tests/unit/engines/test_wikipedia.py | 4 -- tests/unit/engines/test_yacy.py | 5 -- tests/unit/engines/test_yahoo.py | 7 --- tests/unit/engines/test_yahoo_news.py | 7 --- tests/unit/test_preferences.py | 15 +++-- tests/unit/test_query.py | 64 ++++++++++++++++++++ 56 files changed, 166 insertions(+), 249 deletions(-) create mode 100644 tests/unit/test_query.py diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index cad06f8c..245bc50b 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -26,7 +26,7 @@ xpath_results = '//ul[@class="mw-search-results"]/li' xpath_link = './/div[@class="mw-search-result-heading"]/a' -# cut 'en' from 'en_US', 'de' from 'de_CH', and so on +# cut 'en' from 'en-US', 'de' from 'de-CH', and so on def locale_to_lang_code(locale): if locale.find('-') >= 0: locale = locale.split('-')[0] @@ -36,7 +36,7 @@ def locale_to_lang_code(locale): # wikis for some languages were moved off from the main site, we need to make # requests to correct URLs to be able to get results in those languages lang_urls = { - 'all': { + 'en': { 'base': 'https://wiki.archlinux.org', 'search': '/index.php?title=Special:Search&offset={offset}&{query}' }, @@ -67,7 +67,7 @@ lang_urls = { def get_lang_urls(language): if language in lang_urls: return lang_urls[language] - return lang_urls['all'] + return lang_urls['en'] # Language names to build search requests for diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 052d567e..2e58d029 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -32,10 +32,7 @@ search_string = 'search?{query}&first={offset}' def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 - if params['language'] != 'all': - lang = params['language'].split('-')[0].upper() - else: - lang = 'EN' + lang = params['language'].split('-')[0].upper() query = u'language:{} {}'.format(lang, query.decode('utf-8')).encode('utf-8') diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index b999b2a3..c609a194 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -71,10 +71,7 @@ def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 - if params['language'] == 'all': - language = 'en-US' - else: - language = params['language'] + language = params['language'] params['url'] = _get_url(query, language, offset, params['time_range']) diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index fad7e596..cfa76796 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -32,10 +32,7 @@ supported_languages_url = 'https://api.dailymotion.com/languages' # do search-request def request(query, params): - if params['language'] == 'all': - locale = 'en-US' - else: - locale = params['language'] + locale = params['language'] params['url'] = search_url.format( query=urlencode({'search': query, 'localization': locale}), diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 921e29f8..6f8797fe 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -44,9 +44,7 @@ content_xpath = './/a[@class="result__snippet"]' # match query's language to a region code that duckduckgo will accept def get_region_code(lang, lang_list=None): # custom fixes for languages - if lang == 'all': - region_code = None - elif lang[:2] == 'ja': + if lang[:2] == 'ja': region_code = 'jp-jp' elif lang[:2] == 'sl': region_code = 'sl-sl' @@ -82,12 +80,8 @@ def request(query, params): offset = (params['pageno'] - 1) * 30 region_code = get_region_code(params['language']) - if region_code: - params['url'] = url.format( - query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset) - else: - params['url'] = url.format( - query=urlencode({'q': query}), offset=offset, dc_param=offset) + params['url'] = url.format( + query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset) if params['time_range'] in time_range_dict: params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index dbd78b07..7b0e7269 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -53,12 +53,8 @@ def request(query, params): safesearch = params['safesearch'] - 1 region_code = get_region_code(params['language'], lang_list=supported_languages) - if region_code: - params['url'] = images_url.format( - query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd) - else: - params['url'] = images_url.format( - query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd) + params['url'] = images_url.format( + query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd) return params diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py index 7ce3a6ce..4e8b5674 100644 --- a/searx/engines/faroo.py +++ b/searx/engines/faroo.py @@ -40,10 +40,7 @@ def request(query, params): offset = (params['pageno'] - 1) * number_of_results + 1 categorie = search_category.get(params['category'], 'web') - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('_')[0] + language = params['language'].split('-')[0] # if language is not supported, put it in english if language != 'en' and\ diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 2bdc97fd..99791a42 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -49,12 +49,9 @@ supported_languages_url = 'https://gigablast.com/search?&rxikd=1' def request(query, params): offset = (params['pageno'] - 1) * number_of_results - if params['language'] == 'all': - language = 'xx' - else: - language = params['language'].replace('-', '_').lower() - if language.split('-')[0] != 'zh': - language = language.split('-')[0] + language = params['language'].replace('-', '_').lower() + if language.split('-')[0] != 'zh': + language = language.split('-')[0] if params['safesearch'] >= 1: safesearch = 1 diff --git a/searx/engines/google.py b/searx/engines/google.py index 363b7fb6..47a560aa 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -165,7 +165,8 @@ def extract_text_from_dom(result, xpath): def request(query, params): offset = (params['pageno'] - 1) * 10 - if params['language'] == 'all': + # temporary fix until a way of supporting en-US is found + if params['language'] == 'en-US': params['language'] = 'en-GB' if params['language'][:2] == 'jv': diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 8881d0da..8b8e7175 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -50,9 +50,8 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), search_options=urlencode(search_options)) - if params['language'] != 'all': - language_array = params['language'].lower().split('-') - params['url'] += '&lr=lang_' + language_array[0] + language_array = params['language'].lower().split('-') + params['url'] += '&lr=lang_' + language_array[0] return params diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index 0607ac93..c7b05ffc 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -45,10 +45,7 @@ def request(query, params): format_strings = list(Formatter().parse(base_url)) - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('-')[0] + language = params['language'].split('-')[0] # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)] if any(x[1] == 'language' for x in format_strings): diff --git a/searx/engines/photon.py b/searx/engines/photon.py index 15236f68..24084195 100644 --- a/searx/engines/photon.py +++ b/searx/engines/photon.py @@ -35,10 +35,9 @@ def request(query, params): search_string.format(query=urlencode({'q': query}), limit=number_of_results) - if params['language'] != 'all': - language = params['language'].split('_')[0] - if language in supported_languages: - params['url'] = params['url'] + "&lang=" + language + language = params['language'].split('-')[0] + if language in supported_languages: + params['url'] = params['url'] + "&lang=" + language # using searx User-Agent params['headers']['User-Agent'] = searx_useragent() diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 3d266e22..408c2b3d 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -44,18 +44,17 @@ def request(query, params): query=urlencode({'q': query}), offset=offset) - # add language tag if specified - if params['language'] != 'all': - if params['language'] == 'no' or params['language'].startswith('no-'): - params['language'] = params['language'].replace('no', 'nb', 1) - if params['language'].find('-') < 0: - # tries to get a country code from language - for lang in supported_languages: - lc = lang.split('-') - if params['language'] == lc[0]: - params['language'] = lang - break - params['url'] += '&locale=' + params['language'].replace('-', '_').lower() + # add language tag + if params['language'] == 'no' or params['language'].startswith('no-'): + params['language'] = params['language'].replace('no', 'nb', 1) + if params['language'].find('-') < 0: + # tries to get a country code from language + for lang in supported_languages: + lc = lang.split('-') + if params['language'] == lc[0]: + params['language'] = lang + break + params['url'] += '&locale=' + params['language'].replace('-', '_').lower() return params diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 314b7b9a..3e067597 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -45,9 +45,8 @@ def request(query, params): params['data'] = {'query': query, 'startat': offset} - # set language if specified - if params['language'] != 'all': - params['data']['with_language'] = ('lang_' + params['language'].split('-')[0]) + # set language + params['data']['with_language'] = ('lang_' + params['language'].split('-')[0]) return params diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py index 2cbc991b..118504ff 100644 --- a/searx/engines/subtitleseeker.py +++ b/searx/engines/subtitleseeker.py @@ -48,7 +48,7 @@ def response(resp): search_lang = 'Farsi' elif resp.search_params['language'] == 'pt-BR': search_lang = 'Brazilian' - elif resp.search_params['language'] != 'all': + else: search_lang = [lc[3] for lc in language_codes if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]] diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index 00346a7d..45e9d87a 100644 --- a/searx/engines/swisscows.py +++ b/searx/engines/swisscows.py @@ -35,10 +35,7 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=') # do search-request def request(query, params): - if params['language'] == 'all': - ui_language = 'browser' - region = 'browser' - elif params['language'].split('-')[0] == 'no': + if params['language'].split('-')[0] == 'no': region = 'nb-NO' else: region = params['language'] diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index d2a8d208..8641167d 100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -37,12 +37,7 @@ timestamp_xpath = './/span[contains(@class,"_timestamp")]' # do search-request def request(query, params): params['url'] = search_url + urlencode({'q': query}) - - # set language if specified - if params['language'] != 'all': - params['cookies']['lang'] = params['language'].split('-')[0] - else: - params['cookies']['lang'] = 'en' + params['cookies']['lang'] = params['language'].split('-')[0] return params diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index be217463..1f31a1f8 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -57,8 +57,6 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' def request(query, params): language = params['language'].split('-')[0] - if language == 'all': - language = 'en' params['url'] = url_search.format( query=urlencode({'label': query, 'language': language})) @@ -71,8 +69,6 @@ def response(resp): wikidata_ids = html.xpath(wikidata_ids_xpath) language = resp.search_params['language'].split('-')[0] - if language == 'all': - language = 'en' # TODO: make requests asynchronous to avoid timeout when result_count > 1 for wikidata_id in wikidata_ids[:result_count]: diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index db2fdc00..fe82f511 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -31,7 +31,7 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' # set language in base_url def url_lang(lang): lang = lang.split('-')[0] - if lang == 'all' or lang not in supported_languages: + if lang not in supported_languages: language = 'en' else: language = lang diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index a62a1296..c19140bb 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -51,9 +51,7 @@ def request(query, params): limit=number_of_results, search_type=search_type) - # add language tag if specified - if params['language'] != 'all': - params['url'] += '&lr=lang_' + params['language'].split('-')[0] + params['url'] += '&lr=lang_' + params['language'].split('-')[0] return params diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 5387aaf5..626a398b 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -71,9 +71,7 @@ def _get_url(query, offset, language, time_range): def _get_language(params): - if params['language'] == 'all': - return 'en' - elif params['language'][:2] == 'zh': + if params['language'][:2] == 'zh': if params['language'] == 'zh' or params['language'] == 'zh-CH': return 'szh' else: diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index ae54a4ac..69e9aef4 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -38,10 +38,7 @@ suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a' def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('_')[0] + language = params['language'].split('-')[0] params['url'] = search_url.format(offset=offset, query=urlencode({'p': query}), diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py index 6de18aa2..f8bc353f 100644 --- a/searx/engines/youtube_api.py +++ b/searx/engines/youtube_api.py @@ -34,9 +34,7 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), api_key=api_key) - # add language tag if specified - if params['language'] != 'all': - params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0] + params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0] return params diff --git a/searx/preferences.py b/searx/preferences.py index dde4f098..2faa4269 100644 --- a/searx/preferences.py +++ b/searx/preferences.py @@ -12,7 +12,6 @@ if version[0] == '3': COOKIE_MAX_AGE = 60 * 60 * 24 * 365 * 5 # 5 years LANGUAGE_CODES = [l[0] for l in languages] -LANGUAGE_CODES.append('all') DISABLED = 0 ENABLED = 1 DOI_RESOLVERS = list(settings['doi_resolvers']) diff --git a/searx/query.py b/searx/query.py index e4483f18..6e5f2e88 100644 --- a/searx/query.py +++ b/searx/query.py @@ -73,11 +73,6 @@ class RawTextQuery(object): if query_part[0] == ':': lang = query_part[1:].lower().replace('_', '-') - # user may set a valid, yet not selectable language - if VALID_LANGUAGE_CODE.match(lang): - self.languages.append(lang) - parse_next = True - # check if any language-code is equal with # declared language-codes for lc in language_codes: @@ -85,16 +80,25 @@ class RawTextQuery(object): # if correct language-code is found # set it as new search-language - if lang == lang_id\ - or lang_id.startswith(lang)\ - or lang == lang_name\ - or lang == english_name\ - or lang.replace('-', ' ') == country: - parse_next = True - self.languages.append(lang_id) - # to ensure best match (first match is not necessarily the best one) - if lang == lang_id: - break + if (lang == lang_id + or lang == lang_name + or lang == english_name + or lang.replace('-', ' ') == country)\ + and lang not in self.languages: + parse_next = True + lang_parts = lang_id.split('-') + if len(lang_parts) == 2: + self.languages.append(lang_parts[0] + '-' + lang_parts[1].upper()) + else: + self.languages.append(lang_id) + # to ensure best match (first match is not necessarily the best one) + if lang == lang_id: + break + + # user may set a valid, yet not selectable language + if not self.languages and VALID_LANGUAGE_CODE.match(lang): + self.languages.append(lang) + parse_next = True # this force a engine or category if query_part[0] == '!' or query_part[0] == '?': diff --git a/searx/search.py b/searx/search.py index 1bfb4726..b523c275 100644 --- a/searx/search.py +++ b/searx/search.py @@ -24,7 +24,7 @@ from flask_babel import gettext import requests.exceptions import searx.poolrequests as requests_lib from searx.engines import ( - categories, engines + categories, engines, settings ) from searx.answerers import ask from searx.utils import gen_useragent @@ -220,6 +220,10 @@ def get_search_query_from_webapp(preferences, form): else: query_lang = preferences.get_value('language') + # provides backwards compatibility for requests using old language default + if query_lang == 'all': + query_lang = settings['search']['language'] + # check language if not VALID_LANGUAGE_CODE.match(query_lang): raise SearxParameterException('language', query_lang) diff --git a/searx/settings.yml b/searx/settings.yml index 00cac5fe..e819eada 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -5,7 +5,7 @@ general: search: safe_search : 0 # Filter results. 0: None, 1: Moderate, 2: Strict autocomplete : "" # Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "startpage", "wikipedia" - leave blank to turn it off by default - language : "all" + language : "en-US" server: port : 8888 diff --git a/searx/settings_robot.yml b/searx/settings_robot.yml index 070a0edb..e5f163ae 100644 --- a/searx/settings_robot.yml +++ b/searx/settings_robot.yml @@ -5,7 +5,7 @@ general: search: safe_search : 0 autocomplete : "" - language: "all" + language: "en-US" server: port : 11111 diff --git a/searx/templates/courgette/preferences.html b/searx/templates/courgette/preferences.html index 56a6e020..61f52147 100644 --- a/searx/templates/courgette/preferences.html +++ b/searx/templates/courgette/preferences.html @@ -13,7 +13,6 @@ {{ _('Search language') }}

- {% for lang_id,lang_name,country_name,english_name in language_codes | sort(attribute=1) %} {% endfor %} diff --git a/searx/templates/oscar/languages.html b/searx/templates/oscar/languages.html index 96c1c3a9..996c427b 100644 --- a/searx/templates/oscar/languages.html +++ b/searx/templates/oscar/languages.html @@ -3,10 +3,9 @@ {% else %} diff --git a/searx/templates/oscar/preferences.html b/searx/templates/oscar/preferences.html index ac62dc93..89eb99c1 100644 --- a/searx/templates/oscar/preferences.html +++ b/searx/templates/oscar/preferences.html @@ -187,7 +187,7 @@ {{ search_engine.name }} {{ shortcuts[search_engine.name] }} - {{ support_toggle(current_language == 'all' or current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }} + {{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }} {{ support_toggle(search_engine.safesearch==True) }} {{ support_toggle(search_engine.time_range_support==True) }} {{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }} @@ -197,7 +197,7 @@ {{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }} {{ support_toggle(search_engine.time_range_support==True) }} {{ support_toggle(search_engine.safesearch==True) }} - {{ support_toggle(current_language == 'all' or current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }} + {{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }} {{ shortcuts[search_engine.name] }} {{ search_engine.name }} diff --git a/searx/templates/pix-art/preferences.html b/searx/templates/pix-art/preferences.html index 05876ded..0b2bb670 100644 --- a/searx/templates/pix-art/preferences.html +++ b/searx/templates/pix-art/preferences.html @@ -9,7 +9,6 @@ {{ _('Search language') }}