[mod] fetch supported languages for several engines

utils/fetch_languages.py fetches the languages supported by each engine and generates engines_languages.json, which lists each engine's supported languages.
2016-11-05 20:51:38 -06:00 · 2016-11-05 20:51:38 -06:00 · f62ce21f50
parent 92c6e88ad3
commit f62ce21f50
26 changed files with 3633 additions and 362 deletions
--- a/searx/data/engines_languages.json
+++ b/searx/data/engines_languages.json
--- a/searx/engines/init.py
+++ b/searx/engines/init.py
@ -20,6 +20,7 @@ from os.path import realpath, dirname
 import sys
 from flask_babel import gettext
 from operator import itemgetter
 from json import loads
 from searx import settings
 from searx import logger
 from searx.utils import load_module
@ -78,6 +79,9 @@ def load_engine(engine_data):
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])
    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
@ -207,6 +211,8 @@ if 'engines' not in settings or not settings['engines']:
    logger.error('No engines found. Edit your settings.yml')
    exit(2)
 # load the per-engine language lists; the with-block closes the file
 # handle as soon as the JSON has been read
 with open(engine_dir + '/../data/engines_languages.json') as languages_file:
    languages = loads(languages_file.read())
 for engine_data in settings['engines']:
    engine = load_engine(engine_data)
    if engine is not None:
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@ -15,12 +15,14 @@
 from urllib import urlencode
 from lxml import html
 from requests import get
 from searx.engines.xpath import extract_text
 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
 supported_languages_url = 'https://www.bing.com/account/general'
 # search-url
 base_url = 'https://www.bing.com/'
@ -81,3 +83,16 @@ def response(resp):
    # return results
    return results
 # get supported languages from their site
 def fetch_supported_languages():
    """Return the language codes listed on the page at supported_languages_url.

    The page is downloaded and parsed as HTML; the id attribute of every
    input element inside the "limit-languages" div is collected, with
    underscores replaced by dashes.

    Raises IndexError if one of those input elements has no id attribute.
    """
    page = get(supported_languages_url)
    tree = html.fromstring(page.text)
    inputs = tree.xpath('//div[@id="limit-languages"]//input')
    return [element.xpath('./@id')[0].replace('_', '-') for element in inputs]
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@ -19,7 +19,7 @@ from urllib import urlencode
 from lxml import html
 from json import loads
 import re
-from searx.engines.bing import supported_languages
+from searx.engines.bing import fetch_supported_languages
 # engine dependent config
 categories = ['images']
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@ -17,7 +17,7 @@ from datetime import datetime
 from dateutil import parser
 from lxml import etree
 from searx.utils import list_get
-from searx.engines.bing import supported_languages
+from searx.engines.bing import fetch_supported_languages
 # engine dependent config
 categories = ['news']
--- a/searx/engines/dailymotion.py
+++ b/searx/engines/dailymotion.py
@ -15,29 +15,12 @@
 from urllib import urlencode
 from json import loads
 from datetime import datetime
 from requests import get
 # engine dependent config
 categories = ['videos']
 paging = True
 language_support = True
 supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
                       "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
                       "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
                       "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
                       "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
                       "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
                       "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
                       "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
                       "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
                       "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
                       "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
                       "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
                       "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
                       "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
                       "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
                       "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
                       "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
                       "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
 # search-url
 # see http://www.dailymotion.com/doc/api/obj-video.html
@ -45,6 +28,8 @@ search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,descr
 embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
    'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
 supported_languages_url = 'https://api.dailymotion.com/languages'
 # do search-request
 def request(query, params):
@ -92,3 +77,23 @@ def response(resp):
    # return results
    return results
 # get supported languages from their site
 def fetch_supported_languages():
    """Return a dict describing the languages listed at supported_languages_url.

    The JSON response is expected to hold a 'list' of entries with the keys
    'code', 'native_name' and 'name'.  The result maps each code to a dict
    that contains 'name' (the native name) and 'english_name', each only
    when the corresponding value is non-empty.
    """
    payload = loads(get(supported_languages_url).text)
    languages = {}
    for entry in payload['list']:
        code = entry['code']
        details = {}
        native_name = entry['native_name']
        if native_name:
            details['name'] = native_name
        english_name = entry['name']
        if english_name:
            details['english_name'] = english_name
        languages[code] = details
    return languages
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@ -15,19 +15,15 @@
 from urllib import urlencode
 from lxml.html import fromstring
 from requests import get
 from json import loads
 from searx.engines.xpath import extract_text
 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
-supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT",
+supported_languages_url = 'https://duckduckgo.com/d2030.js'
                       "es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE",
                       "el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP",
                       "kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO",
                       "es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG",
                       "sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW",
                       "th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"]
 time_range_support = True
 # search-url
@ -65,8 +61,6 @@ def request(query, params):
        locale = 'xa' + params['language'].split('-')[0]
    elif params['language'][-2:] == 'GB':
        locale = 'uk' + params['language'].split('-')[0]
    elif params['language'] == 'es-419':
        locale = 'xl-es'
    else:
        locale = params['language'].split('-')
        if len(locale) == 2:
@ -120,3 +114,18 @@ def response(resp):
    # return results
    return results
 # get supported languages from their site
 def fetch_supported_languages():
    """Return the list of language codes extracted from supported_languages_url.

    The response is a js file with the regions as an embedded object
    ("regions:{...}").  That object is cut out of the text, parsed as JSON,
    and each of its keys is rearranged: the part after the third character,
    a dash, then the first two characters in upper case.

    Raises ValueError when the regions object cannot be located or parsed.
    """
    response = get(supported_languages_url)
    response_page = response.text

    marker = 'regions:{'
    start = response_page.find(marker)
    if start == -1:
        raise ValueError('regions object not found in ' + supported_languages_url)
    # step over the marker but keep its opening brace for the JSON parser
    start += len(marker) - 1

    # the object ends at the first closing brace after its start
    end = response_page.find('}', start)
    if end == -1:
        raise ValueError('unterminated regions object in ' + supported_languages_url)

    regions_json = loads(response_page[start:end + 1])
    # build a real list (not a lazy map object) so the result can be
    # indexed and serialized regardless of the Python version
    return [key[3:] + '-' + key[:2].upper() for key in regions_json]
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@ -4,7 +4,7 @@ from re import compile, sub
 from lxml import html
 from searx.utils import html_to_text
 from searx.engines.xpath import extract_text
-from searx.engines.duckduckgo import supported_languages
+from searx.engines.duckduckgo import fetch_supported_languages
 url = 'https://api.duckduckgo.com/'\
    + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@ -14,6 +14,8 @@ from json import loads
 from random import randint
 from time import time
 from urllib import urlencode
 from requests import get
 from lxml.html import fromstring
 # engine dependent config
 categories = ['general']
@ -40,11 +42,7 @@ url_xpath = './/url'
 title_xpath = './/title'
 content_xpath = './/sum'
-supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de",
+supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
                       "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
                       "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
                       "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
 # do search-request
@ -90,3 +88,17 @@ def response(resp):
    # return results
    return results
 # get supported languages from their site
 def fetch_supported_languages():
    """Return the language codes linked from the page at supported_languages_url.

    For every link inside the span with id "menu2", the last two characters
    of its href are taken as the code.  The code 'xx' is ignored and each
    code is listed only once, in order of first appearance.

    Raises IndexError if one of those links has no href attribute.
    """
    tree = fromstring(get(supported_languages_url).text)
    codes = []
    for anchor in tree.xpath('//span[@id="menu2"]/a'):
        lang = anchor.xpath('./@href')[0][-2:]
        if lang == 'xx' or lang in codes:
            continue
        codes.append(lang)
    return codes
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@ -12,6 +12,7 @@ import re
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
 from lxml import html, etree
 from requests import get
 from searx.engines.xpath import extract_text, extract_url
 from searx.search import logger
@ -23,20 +24,6 @@ categories = ['general']
 paging = True
 language_support = True
 use_locale_domain = True
 supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca",
                       "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et",
                       "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr",
                       "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw",
                       "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw",
                       "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz",
                       "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso",
                       "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT",
                       "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st",
                       "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum",
                       "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk",
                       "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps",
                       "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te",
                       "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"]
 time_range_support = True
 # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
@ -117,6 +104,7 @@ map_hostname_start = 'maps.google.'
 maps_path = '/maps'
 redirect_path = '/url'
 images_path = '/images'
 supported_languages_url = 'https://www.google.com/preferences?#languages'
 # specific xpath variables
 results_xpath = '//div[@class="g"]'
@ -373,3 +361,17 @@ def attributes_to_html(attributes):
        retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
    retval = retval + '</table>'
    return retval
 # get supported languages from their site
 def fetch_supported_languages():
    """Return a dict of the languages offered at supported_languages_url.

    Each option of the select element named "hl" contributes one entry.  The
    key is the option value up to its first dash, so values that share that
    prefix end up in a single entry (the last one wins).  The entry's "name"
    is the option text without its last character, in title case.
    """
    page = get(supported_languages_url)
    tree = html.fromstring(page.text)
    languages = {}
    for entry in tree.xpath('//select[@name="hl"]/option'):
        value = entry.xpath('./@value')[0]
        lang = value.split('-')[0]
        label = entry.text[:-1]
        languages[lang] = {"name": label.title()}
    return languages
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@ -13,7 +13,7 @@
 from lxml import html
 from urllib import urlencode
 from json import loads
-from searx.engines.google import supported_languages
+from searx.engines.google import fetch_supported_languages
 # search-url
 categories = ['news']
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@ -15,7 +15,6 @@
 from json import loads
 from string import Formatter
 from urllib import urlencode, quote
 from searx.engines.wikipedia import supported_languages
 # engine dependent config
 categories = ['general']
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@ -20,11 +20,6 @@ from searx.utils import html_to_text
 categories = None
 paging = True
 language_support = True
 supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
                       "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
                       "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
                       "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
                       "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
 category_to_keyword = {'general': 'web',
                       'images': 'images',
@ -51,15 +46,7 @@ def request(query, params):
    # add language tag if specified
    if params['language'] != 'all':
        locale = params['language'].split('-')
        if len(locale) == 2 and params['language'] in supported_languages:
        params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
        else:
            # try to get a country code for language
            for lang in supported_languages:
                if locale[0] == lang.split('-')[0]:
                    params['url'] += '&locale=' + lang.replace('-', '_').lower()
                    break
    return params
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@ -24,11 +24,6 @@ categories = ['general']
 # paging = False
 language_support = True
 supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
                       "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
                       "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
                       "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
                       "sv", "tl", "th", "tr", "uk", "vi"]
 # search-url
 base_url = 'https://startpage.com/'
--- a/searx/engines/subtitleseeker.py
+++ b/searx/engines/subtitleseeker.py
@ -22,7 +22,7 @@ language = ""
 # search-url
 url = 'http://www.subtitleseeker.com/'
-search_url = url + 'search/TITLES/{query}&p={pageno}'
+search_url = url + 'search/TITLES/{query}?p={pageno}'
 # specific xpath variables
 results_xpath = '//div[@class="boxRows"]'
@ -51,7 +51,8 @@ def response(resp):
    elif resp.search_params['language'] != 'all':
        search_lang = [lc[3]
                       for lc in language_codes
-                       if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]
+                       if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
        search_lang = search_lang[0].split(' (')[0]
    # parse results
    for result in dom.xpath(results_xpath):
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@ -13,17 +13,13 @@
 from json import loads
 from urllib import urlencode, unquote
 import re
 from requests import get
 from lxml.html import fromstring
 # engine dependent config
 categories = ['general', 'images']
 paging = True
 language_support = True
 supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
                       "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
                       "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
                       "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
                       "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
                       "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
 # search-url
 base_url = 'https://swisscows.ch/'
@ -114,3 +110,16 @@ def response(resp):
    # return results
    return results
 # get supported languages from their site
 def fetch_supported_languages():
    """Return the codes listed in the regions popup of the page at base_url.

    The data-val attribute of every link inside the list items of the
    "regions-popup" div is collected, in document order.

    Raises IndexError if one of those links has no data-val attribute.
    """
    tree = fromstring(get(base_url).text)
    region_links = tree.xpath('//div[@id="regions-popup"]//ul/li/a')
    return [link.xpath('./@data-val')[0] for link in region_links]
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@ -15,7 +15,7 @@ from searx import logger
 from searx.poolrequests import get
 from searx.engines.xpath import extract_text
 from searx.utils import format_date_by_locale
-from searx.engines.wikipedia import supported_languages
+from searx.engines.wikipedia import fetch_supported_languages
 from json import loads
 from lxml.html import fromstring
@ -57,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
 def request(query, params):
-    language = params['language'].split('_')[0]
+    language = params['language'].split('-')[0]
    if language == 'all':
        language = 'en'
@ -72,7 +72,7 @@ def response(resp):
    html = fromstring(resp.content)
    wikidata_ids = html.xpath(wikidata_ids_xpath)
-    language = resp.search_params['language'].split('_')[0]
+    language = resp.search_params['language'].split('-')[0]
    if language == 'all':
        language = 'en'
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@ -12,36 +12,9 @@
 from json import loads
 from urllib import urlencode, quote
 from requests import get
 from lxml.html import fromstring
 supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
                       "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
                       "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
                       "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
                       "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
                       "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
                       "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
                       "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
                       "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
                       "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
                       "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
                       "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
                       "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
                       "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
                       "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
                       "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
                       "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
                       "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
                       "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
                       "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
                       "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
                       "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
                       "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
                       "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
                       "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
                       "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
                       "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
                       "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
                       "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]
 # search-url
 base_url = 'https://{language}.wikipedia.org/'
@ -54,6 +27,7 @@ search_postfix = 'w/api.php?'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 # set language in base_url
@ -142,3 +116,24 @@ def response(resp):
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
    return results
 # get supported languages from their site
 def fetch_supported_languages():
    """Return a dict of the wikis listed at supported_languages_url.

    Every table whose class contains "sortable" is scanned row by row,
    skipping the first row of each table.  From each row the linked texts of
    the second, third and fourth cells are read as english name, name and
    code, and the bold linked number in the fifth cell as the article count.
    Only rows with at least 10000 articles are kept.
    """
    languages = {}
    page = get(supported_languages_url)
    tree = fromstring(page.text)
    for table in tree.xpath('//table[contains(@class,"sortable")]'):
        # the first row of each table is its header
        for row in table.xpath('.//tr')[1:]:
            cells = row.xpath('./td')
            code = cells[3].xpath('./a')[0].text
            name = cells[2].xpath('./a')[0].text
            english_name = cells[1].xpath('./a')[0].text
            # the count is written with thousands separators, e.g. "12,345"
            count_text = cells[4].xpath('./a/b')[0].text
            articles = int(count_text.replace(',', ''))
            if articles < 10000:
                continue
            languages[code] = {"name": name, "english_name": english_name, "articles": articles}
    return languages
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@ -14,16 +14,13 @@
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
 from requests import get
 from searx.engines.xpath import extract_text, extract_url
 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
 supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en",
                       "et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja",
                       "ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr",
                       "sl", "es", "sv", "th", "tr"]
 time_range_support = True
 # search-url
@ -31,6 +28,8 @@ base_url = 'https://search.yahoo.com/'
 search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
 search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
 supported_languages_url = 'https://search.yahoo.com/web/advanced'
 # specific xpath variables
 results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
 url_xpath = './/h3/a/@href'
@ -142,3 +141,16 @@ def response(resp):
    # return results
    return results
 # get supported languages from their site
 def fetch_supported_languages():
    """Return the language codes offered at supported_languages_url.

    The value attribute of every input element below the "yschlang" div is
    read and its first five characters are dropped (presumably the "lang_"
    prefix that search_url also uses -- verify against the page).

    Raises IndexError if one of those input elements has no value attribute.
    """
    tree = html.fromstring(get(supported_languages_url).text)
    inputs = tree.xpath('//div[@id="yschlang"]/span/label/input')
    return [element.xpath('./@value')[0][5:] for element in inputs]
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@ -12,7 +12,7 @@
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
-from searx.engines.yahoo import parse_url, supported_languages
+from searx.engines.yahoo import parse_url, fetch_supported_languages
 from datetime import datetime, timedelta
 import re
 from dateutil import parser
--- a/searx/languages.py
+++ b/searx/languages.py
@ -4,39 +4,29 @@
 language_codes = (
    (u"ach", u"Acoli", u"", u""),
-    (u"af", u"Afrikaans", u"", u"Afrikaans"),
+    (u"af", u"Afrikaans", u"", u""),
    (u"ak", u"Akan", u"", u""),
-    (u"als", u"Alemannisch", u"", u"Alemannic"),
+    (u"am", u"አማርኛ", u"", u""),
    (u"am", u"አማርኛ", u"", u"Amharic"),
    (u"an", u"Aragonés", u"", u"Aragonese"),
    (u"ar-SA", u"العربية", u"المملكة العربية السعودية", u"Arabic"),
    (u"arz", u"مصرى (Maṣri)", u"", u"Egyptian Arabic"),
    (u"ast", u"Asturianu", u"", u"Asturian"),
    (u"az", u"Azərbaycanca", u"", u"Azerbaijani"),
    (u"azb", u"تۆرکجه", u"", u"South Azerbaijani"),
    (u"ba", u"Башҡорт", u"", u"Bashkir"),
    (u"ban", u"Balinese", u"", u""),
    (u"bar", u"Boarisch", u"", u"Bavarian"),
    (u"be", u"Беларуская", u"", u"Belarusian"),
    (u"bem", u"Ichibemba", u"", u""),
    (u"bg-BG", u"Български", u"България", u"Bulgarian"),
-    (u"bn", u"বাংলা", u"", u"Bengali"),
+    (u"bn", u"বাংলা", u"", u""),
-    (u"bpy", u"ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী", u"", u"Bishnupriya Manipuri"),
+    (u"br", u"Brezhoneg", u"", u""),
-    (u"br", u"Brezhoneg", u"", u"Breton"),
+    (u"bs", u"Bosanski", u"", u""),
    (u"bs", u"Bosanski", u"", u"Bosnian"),
    (u"bug", u"Basa Ugi", u"", u"Buginese"),
    (u"ca", u"Català", u"", u"Catalan"),
    (u"ca-CT", u"Català", u"", u"Catalan"),
    (u"ca-ES", u"Català", u"Espanya", u"Catalan"),
    (u"ce", u"Нохчийн", u"", u"Chechen"),
    (u"ceb", u"Sinugboanong Binisaya", u"", u"Cebuano"),
    (u"chr", u"ᏣᎳᎩ", u"", u""),
-    (u"ckb", u"Soranî / کوردی", u"", u"Sorani"),
+    (u"ckb", u"Central Kurdish", u"", u""),
    (u"co", u"Corsican", u"", u""),
    (u"crs", u"Seychellois Creole", u"", u""),
    (u"cs-CZ", u"Čeština", u"Česko", u"Czech"),
-    (u"cv", u"Чăваш", u"", u"Chuvash"),
+    (u"cy", u"Cymraeg", u"", u""),
    (u"cy", u"Cymraeg", u"", u"Welsh"),
    (u"da-DK", u"Dansk", u"Danmark", u"Danish"),
    (u"de", u"Deutsch", u"", u"German"),
    (u"de-AT", u"Deutsch", u"Österreich", u"German"),
@ -70,148 +60,129 @@ language_codes = (
    (u"eu", u"Euskara", u"", u"Basque"),
    (u"fa", u"فارسی", u"", u"Persian"),
    (u"fi-FI", u"Suomi", u"Suomi", u"Finnish"),
-    (u"fo", u"Føroyskt", u"", u"Faroese"),
+    (u"fo", u"Føroyskt", u"", u""),
    (u"fr", u"Français", u"", u"French"),
    (u"fr-BE", u"Français", u"Belgique", u"French"),
    (u"fr-CA", u"Français", u"Canada", u"French"),
    (u"fr-CH", u"Français", u"Suisse", u"French"),
    (u"fr-FR", u"Français", u"France", u"French"),
-    (u"fy", u"Frysk", u"", u"West Frisian"),
+    (u"fy", u"West-Frysk", u"", u""),
-    (u"ga", u"Gaeilge", u"", u"Irish"),
+    (u"ga", u"Gaeilge", u"", u""),
    (u"gaa", u"Ga", u"", u""),
-    (u"gd", u"Gàidhlig", u"", u"Scottish Gaelic"),
+    (u"gd", u"Gàidhlig", u"", u""),
    (u"gl", u"Galego", u"", u"Galician"),
    (u"gn", u"Guarani", u"", u""),
-    (u"gu", u"ગુજરાતી", u"", u"Gujarati"),
+    (u"gu", u"ગુજરાતી", u"", u""),
    (u"ha", u"Hausa", u"", u""),
    (u"haw", u"ʻŌlelo HawaiʻI", u"", u""),
    (u"he-IL", u"עברית", u"ישראל", u"Hebrew"),
    (u"hi", u"हिन्दी", u"", u"Hindi"),
    (u"hr-HR", u"Hrvatski", u"Hrvatska", u"Croatian"),
-    (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
+    (u"ht", u"Haitian Creole", u"", u""),
    (u"ht", u"Krèyol ayisyen", u"", u"Haitian"),
    (u"hu-HU", u"Magyar", u"Magyarország", u"Hungarian"),
    (u"hy", u"Հայերեն", u"", u"Armenian"),
-    (u"ia", u"Interlingua", u"", u"Interlingua"),
+    (u"ia", u"Interlingua", u"", u""),
    (u"id-ID", u"Bahasa Indonesia", u"Indonesia", u"Indonesian"),
    (u"ig", u"Igbo", u"", u""),
-    (u"io", u"Ido", u"", u"Ido"),
+    (u"is", u"Íslenska", u"", u""),
    (u"is", u"Íslenska", u"", u"Icelandic"),
    (u"it", u"Italiano", u"", u"Italian"),
    (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
    (u"it-IT", u"Italiano", u"Italia", u"Italian"),
    (u"iw", u"עברית", u"", u""),
    (u"ja-JP", u"日本語", u"日本", u"Japanese"),
    (u"jv", u"Basa Jawa", u"", u"Javanese"),
    (u"ka", u"ქართული", u"", u"Georgian"),
    (u"kg", u"Kongo", u"", u""),
    (u"kk", u"Қазақша", u"", u"Kazakh"),
    (u"km", u"ខ្មែរ", u"", u""),
-    (u"kn", u"ಕನ್ನಡ", u"", u"Kannada"),
+    (u"kn", u"ಕನ್ನಡ", u"", u""),
    (u"ko-KR", u"한국어", u"대한민국", u"Korean"),
-    (u"kri", u"Krio (Sierra Leone)", u"", u""),
+    (u"kri", u"Krio", u"", u""),
-    (u"ku", u"Kurdî / كوردی", u"", u"Kurdish"),
+    (u"ky", u"Кыргызча", u"", u""),
    (u"ky", u"Кыргызча", u"", u"Kirghiz"),
    (u"la", u"Latina", u"", u"Latin"),
    (u"lb", u"Lëtzebuergesch", u"", u"Luxembourgish"),
    (u"lg", u"Luganda", u"", u""),
    (u"li", u"Limburgs", u"", u"Limburgish"),
    (u"lmo", u"Lumbaart", u"", u"Lombard"),
    (u"ln", u"Lingála", u"", u""),
    (u"lo", u"ລາວ", u"", u""),
    (u"loz", u"Lozi", u"", u""),
    (u"lt-LT", u"Lietuvių", u"Lietuva", u"Lithuanian"),
    (u"lua", u"Luba-Lulua", u"", u""),
-    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u"Latvian"),
+    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u""),
    (u"mfe", u"Kreol Morisien", u"", u""),
-    (u"mg", u"Malagasy", u"", u"Malagasy"),
+    (u"mg", u"Malagasy", u"", u""),
    (u"mi", u"Maori", u"", u""),
    (u"min", u"Minangkabau", u"", u"Minangkabau"),
-    (u"mk", u"Македонски", u"", u"Macedonian"),
+    (u"mk", u"Македонски", u"", u""),
-    (u"ml", u"മലയാളം", u"", u"Malayalam"),
+    (u"ml", u"മലയാളം", u"", u""),
-    (u"mn", u"Монгол", u"", u"Mongolian"),
+    (u"mn", u"Монгол", u"", u""),
-    (u"mr", u"मराठी", u"", u"Marathi"),
+    (u"mr", u"मराठी", u"", u""),
    (u"mrj", u"Кырык Мары (Kyryk Mary)", u"", u"Hill Mari"),
    (u"ms-MY", u"Bahasa Melayu", u"Malaysia", u"Malay"),
    (u"mt", u"Malti", u"", u""),
-    (u"my", u"မြန်မာဘာသာ", u"", u"Burmese"),
+    (u"my", u"ဗမာ", u"", u""),
-    (u"mzn", u"مَزِروني", u"", u"Mazandarani"),
+    (u"nb-NO", u"Norwegian Bokmål", u"Norge", u"Norwegian Bokmål"),
-    (u"nah", u"Nāhuatl", u"", u"Nahuatl"),
+    (u"ne", u"नेपाली", u"", u""),
    (u"nap", u"Nnapulitano", u"", u"Neapolitan"),
    (u"nds-nl", u"Plattdüütsch", u"Nedderlannen", u"Low Saxon"),
    (u"ne", u"नेपाली", u"", u"Nepali"),
    (u"new", u"नेपाल भाषा", u"", u"Newar"),
    (u"nl", u"Nederlands", u"", u"Dutch"),
    (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
    (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
-    (u"nn", u"Nynorsk", u"", u"Norwegian (Nynorsk)"),
+    (u"nn", u"Nynorsk", u"", u"Norwegian"),
-    (u"no-NO", u"Norsk (Bokmål)", u"Norge", u"Norwegian (Bokmål)"),
+    (u"no-NO", u"Norsk", u"Norge", u"Norwegian"),
    (u"nso", u"Northern Sotho", u"", u""),
    (u"ny", u"Nyanja", u"", u""),
    (u"nyn", u"Runyankore", u"", u""),
-    (u"oc", u"Occitan", u"", u"Occitan"),
+    (u"oc", u"Occitan", u"", u""),
    (u"om", u"Oromoo", u"", u""),
-    (u"or", u"ଓଡ଼ିଆ", u"", u"Oriya"),
+    (u"or", u"ଓଡ଼ିଆ", u"", u""),
-    (u"os", u"Иронау", u"", u"Ossetian"),
+    (u"pa", u"ਪੰਜਾਬੀ", u"", u""),
    (u"pa", u"ਪੰਜਾਬੀ", u"", u"Punjabi"),
    (u"pcm", u"Nigerian Pidgin", u"", u""),
    (u"pl-PL", u"Polski", u"Rzeczpospolita Polska", u"Polish"),
    (u"pms", u"Piemontèis", u"", u"Piedmontese"),
    (u"pnb", u"شاہ مکھی پنجابی (Shāhmukhī Pañjābī)", u"", u"Western Punjabi"),
    (u"ps", u"پښتو", u"", u""),
    (u"pt", u"Português", u"", u"Portuguese"),
    (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
    (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
-    (u"qu", u"Runa Simi", u"", u"Quechua"),
+    (u"qu", u"Runasimi", u"", u""),
    (u"rm", u"Rumantsch", u"", u""),
    (u"rn", u"Ikirundi", u"", u""),
    (u"ro-RO", u"Română", u"România", u"Romanian"),
    (u"ru-RU", u"Русский", u"Россия", u"Russian"),
    (u"rw", u"Kinyarwanda", u"", u""),
    (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
    (u"sah", u"Саха тыла (Saxa Tyla)", u"", u"Sakha"),
    (u"scn", u"Sicilianu", u"", u"Sicilian"),
    (u"sco", u"Scots", u"", u"Scots"),
    (u"sd", u"Sindhi", u"", u""),
    (u"sh", u"Srpskohrvatski / Српскохрватски", u"", u"Serbo-Croatian"),
-    (u"si", u"සිංහල", u"", u"Sinhalese"),
+    (u"si", u"සිංහල", u"", u""),
    (u"sk-SK", u"Slovenčina", u"Slovenská republika", u"Slovak"),
-    (u"sl-SI", u"Slovenščina", u"Slovenija", u"Slovenian"),
+    (u"sl", u"Slovenščina", u"", u"Slovenian"),
    (u"sn", u"Chishona", u"", u""),
    (u"so", u"Soomaali", u"", u""),
-    (u"sq", u"Shqip", u"", u"Albanian"),
+    (u"sq", u"Shqip", u"", u""),
-    (u"sr-ME", u"Српски / Srpski", u"Црна Гора", u"Serbian"),
+    (u"sr", u"Српски / Srpski", u"", u"Serbian"),
    (u"st", u"Southern Sotho", u"", u""),
-    (u"su", u"Basa Sunda", u"", u"Sundanese"),
+    (u"su", u"Sundanese", u"", u""),
    (u"sv-SE", u"Svenska", u"Sverige", u"Swedish"),
-    (u"sw", u"Kiswahili", u"", u"Swahili"),
+    (u"sw", u"Kiswahili", u"", u""),
-    (u"ta", u"தமிழ்", u"", u"Tamil"),
+    (u"ta", u"தமிழ்", u"", u""),
-    (u"te", u"తెలుగు", u"", u"Telugu"),
+    (u"te", u"తెలుగు", u"", u""),
-    (u"tg", u"Тоҷикӣ", u"", u"Tajik"),
+    (u"tg", u"Tajik", u"", u""),
    (u"th-TH", u"ไทย", u"ไทย", u"Thai"),
    (u"ti", u"ትግርኛ", u"", u""),
    (u"tk", u"Turkmen", u"", u""),
-    (u"tl-PH", u"Tagalog", u"Pilipinas", u"Tagalog"),
+    (u"tl-PH", u"Filipino", u"Pilipinas", u""),
    (u"tlh", u"Klingon", u"", u""),
    (u"tn", u"Tswana", u"", u""),
    (u"to", u"Lea Fakatonga", u"", u""),
    (u"tr-TR", u"Türkçe", u"Türkiye", u"Turkish"),
-    (u"tt", u"Tatarça / Татарча", u"", u"Tatar"),
+    (u"tt", u"Tatar", u"", u""),
    (u"tum", u"Tumbuka", u"", u""),
    (u"tw", u"Twi", u"", u""),
    (u"ug", u"ئۇيغۇرچە", u"", u""),
    (u"uk-UA", u"Українська", u"Україна", u"Ukrainian"),
    (u"ur", u"اردو", u"", u"Urdu"),
    (u"uz", u"O‘zbek", u"", u"Uzbek"),
-    (u"vec", u"Vèneto", u"", u"Venetian"),
+    (u"ve", u"Venda", u"", u"Venda"),
    (u"vi-VN", u"Tiếng Việt", u"Công Hòa Xã Hội Chủ Nghĩa Việt Nam", u"Vietnamese"),
    (u"vo", u"Volapük", u"", u"Volapük"),
    (u"wa", u"Walon", u"", u"Walloon"),
    (u"war", u"Winaray", u"", u"Waray-Waray"),
    (u"wo", u"Wolof", u"", u""),
    (u"xh", u"Xhosa", u"", u""),
-    (u"yi", u"ייִדיש", u"", u"Yiddish"),
+    (u"yi", u"ייִדיש", u"", u""),
-    (u"yo", u"Yorùbá", u"", u"Yoruba"),
+    (u"yo", u"Èdè Yorùbá", u"", u""),
    (u"zh", u"中文", u"", u"Chinese"),
    (u"zh-CN", u"中文", u"中国", u"Chinese"),
    (u"zh-HK", u"中文", u"香港", u"Chinese"),
--- a/searx/webapp.py
+++ b/searx/webapp.py
@ -514,7 +514,7 @@ def index():
        answers=result_container.answers,
        infoboxes=result_container.infoboxes,
        paging=result_container.paging,
-        current_language=search.lang,
+        current_language=search_query.lang,
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())]
--- a/tests/unit/engines/test_subtitleseeker.py
+++ b/tests/unit/engines/test_subtitleseeker.py
@ -17,7 +17,7 @@ class TestSubtitleseekerEngine(SearxTestCase):
    def test_response(self):
        dicto = defaultdict(dict)
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
        response = mock.Mock(search_params=dicto)
        self.assertRaises(AttributeError, subtitleseeker.response, None)
--- a/tests/unit/engines/test_wikipedia.py
+++ b/tests/unit/engines/test_wikipedia.py
@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
 class TestWikipediaEngine(SearxTestCase):
    def test_request(self):
        wikipedia.supported_languages = ['fr', 'en']
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['language'] = 'fr-FR'
--- a/utils/fetch_languages.py
+++ b/utils/fetch_languages.py
@ -0,0 +1,164 @@
 # -*- coding: utf-8 -*-
 # This script generates languages.py from intersecting each engine's supported languages.
 #
 # The country names are obtained from http://api.geonames.org which requires registering as a user.
 #
 # Output files (engines_languages.json and languages.py)
 # are written in current directory to avoid overwriting in case something goes wrong.
 from requests import get
 from urllib import urlencode
 from lxml.html import fromstring
 from json import loads, dumps
 import io
 from sys import path
 path.append('../searx')  # noqa
 from searx.engines import engines
 # Geonames API for country names.
 # get_country_name() returns '' without sending a request while this is empty.
 geonames_user = ''  # ADD USER NAME HERE
 # URL template; {parameters} is filled with the url-encoded query string.
 country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
 # Output files.
 engines_languages_file = 'engines_languages.json'
 languages_file = 'languages.py'
 # Engine name -> return value of that engine's fetch_supported_languages().
 engines_languages = {}
 # Language code -> dict of language data; filled by join_language_lists().
 languages = {}
 # To filter out invalid codes and dialects.
 def valid_code(lang_code):
    """Tell whether a language code should be kept.

    Returns False for a fixed set of unwanted codes and for dialects,
    True for everything else.
    """
    # unwanted codes
    # sl-SL is technically not invalid, but still a mistake
    if lang_code.startswith('xx') \
       or lang_code in ('sl-SL', 'wt-WT', 'jw') \
       or lang_code.endswith(('UK', 'XA', 'XL')):
        return False

    # dialects: more than two subtags, or a subtag that is too long
    subtags = lang_code.split('-')
    if len(subtags) > 2:
        return False
    if len(subtags[0]) > 3:
        return False
    return len(subtags) == 1 or len(subtags[1]) <= 2
 # Get country name in specified language.
 def get_country_name(locale):
    """Return the name of a locale's country as reported by the Geonames API.

    The part of ``locale`` before ``-`` is sent as ``lang`` and the part
    after it as ``country``, together with ``geonames_user`` as ``username``.

    Returns an empty string when no Geonames user name is configured, when
    ``locale`` does not consist of exactly two ``-`` separated parts, or when
    the response's ``geonames`` list is missing or does not hold exactly one
    entry (that last case is also reported on stdout).
    """
    # compare by value: whether two equal strings are the same object is an
    # implementation detail, so an identity test is not reliable here
    if not geonames_user:
        return ''

    parts = locale.split('-')
    if len(parts) != 2:
        return ''

    query = urlencode({'lang': parts[0],
                       'country': parts[1],
                       'username': geonames_user})
    response = get(country_names_url.format(parameters=query))
    content = loads(response.text).get('geonames', None)
    if content is None or len(content) != 1:
        # a single parenthesised argument prints the same on Python 2 and 3
        print("No country name found for " + parts[0] + "-" + parts[1])
        return ''

    return content[0].get('countryName', '')
 # Fetches the supported languages of each engine and writes them to a JSON file.
 def fetch_supported_languages():
    """Collect each engine's supported languages and dump them as JSON.

    Every entry of ``engines`` that has a ``fetch_supported_languages``
    attribute is called and its return value is stored in
    ``engines_languages`` under the engine's name.  If the call raises, the
    exception is printed and the engine is skipped, so one failing engine
    does not abort the run.

    The collected data is then written to ``engines_languages_file``.
    """
    for engine_name in engines:
        engine = engines[engine_name]
        if hasattr(engine, 'fetch_supported_languages'):
            try:
                engines_languages[engine_name] = engine.fetch_supported_languages()
            except Exception as e:
                print(e)

    # write json file
    # the context manager closes the file even if serialising or writing fails
    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
        f.write(unicode(dumps(engines_languages, indent=4, ensure_ascii=False, encoding="utf-8")))
 # Join all language lists.
 # Iterate all languages supported by each engine.
 def join_language_lists():
    """Merge the per-engine language lists into ``languages``.

    Wikipedia's entries go in first (valid codes with at least 100000
    articles only).  Then every valid locale of every engine that is missing
    from ``languages``, or present without a name, is added: with the
    engine's own data when the engine provides a dict and the entry's
    ``articles`` count (treated as unlimited when absent) is at least 100000,
    as an empty dict otherwise.

    Finally each entry that still has no name takes ``name`` and
    ``english_name`` from its base language (the part before ``-``) and its
    ``country`` from ``get_country_name``.  Entries whose base language has no
    name either are removed.

    Raises ``KeyError`` when ``engines_languages`` has no ``wikipedia`` entry.
    """
    # include wikipedia first for more accurate language names
    # exclude languages with too few articles
    languages.update({code: lang for code, lang
                      in engines_languages['wikipedia'].items()
                      if valid_code(code) and lang['articles'] >= 100000})

    for engine_name in engines_languages:
        engine_languages = engines_languages[engine_name]
        for locale in engine_languages:
            if not valid_code(locale):
                continue

            # if language is not on list or if it has no name yet
            if locale not in languages or not languages[locale].get('name'):
                if isinstance(engine_languages, dict) \
                   and engine_languages[locale].get('articles', float('inf')) >= 100000:
                    languages[locale] = engine_languages[locale]
                else:
                    languages[locale] = {}

    # get locales that have no name yet
    # iterate over a snapshot of the keys: entries are deleted inside the loop
    for locale in list(languages):
        if languages[locale].get('name'):
            continue

        # try to get language and country names
        base_language = languages.get(locale.split('-')[0], {})
        name = base_language.get('name', None)
        if name:
            languages[locale]['name'] = name
            languages[locale]['country'] = get_country_name(locale) or ''
            languages[locale]['english_name'] = base_language.get('english_name', '')
        else:
            # filter out locales with no name
            del languages[locale]
 # Remove countryless language if language is featured in only one country.
 def filter_single_country_languages():
    """Drop the countryless entry of languages that have one country variant.

    For every language that appears in ``languages`` with exactly one
    ``<language>-<country>`` code, the plain ``<language>`` entry is deleted
    if there is one.  Languages with no or with several country variants are
    left untouched.
    """
    # count the country variants of every language
    countries = {}
    for code in languages:
        lang, separator, _ = code.partition('-')
        if separator:
            countries[lang] = countries.get(lang, 0) + 1

    for lang, count in countries.items():
        # a language may be listed through its country variants only,
        # in which case there is no countryless entry to delete
        if count == 1 and lang in languages:
            del languages[lang]
 # Write languages.py.
 def write_languages_file():
    """Write ``languages`` to ``languages_file`` as Python source.

    The file defines ``language_codes``, a tuple with one
    ``(code, name, country, english_name)`` row per entry of ``languages``,
    sorted by code.  ``country`` and ``english_name`` default to an empty
    string, and everything from `` (`` onwards is cut off the name and the
    English name.  The file is encoded as UTF-8.
    """
    header = u'# -*- coding: utf-8 -*-\n'\
             u'# list of language codes\n'\
             u'# this file is generated automatically by utils/fetch_languages.py\n'\
             u'\nlanguage_codes = ('

    rows = []
    for code in sorted(languages):
        language = languages[code]
        rows.append(u'\n    (u"' + code + u'"'
                    + u', u"' + language['name'].split(' (')[0] + u'"'
                    + u', u"' + language.get('country', '') + u'"'
                    + u', u"' + language.get('english_name', '').split(' (')[0] + u'")')

    # joining the rows keeps the output valid even when there are no languages
    file_content = header + u','.join(rows) + u'\n)\n'

    # the context manager closes the file even if the write fails
    with io.open(languages_file, 'w', encoding='utf-8') as new_file:
        new_file.write(file_content)
 if __name__ == "__main__":
    # run the generation steps in order; each one works on the module-level
    # engines_languages / languages dicts filled by the previous steps
    for step in (fetch_supported_languages,
                 join_language_lists,
                 filter_single_country_languages,
                 write_languages_file):
        step()
--- a/utils/update_languages.py
+++ b/utils/update_languages.py
@ -1,169 +0,0 @@
 # -*- coding: utf-8 -*-
 # This script generates languages.py from
 # intersecting each engine's supported languages.
 #
 # The language's native names are obtained from
 # Wikipedia and Google's supported languages.
 #
 # The country names are obtained from http://api.geonames.org
 # which requires registering as a user.
 #
 # Output file (languages.py) is written in current directory
 # to avoid overwriting in case something goes wrong.
 from requests import get
 from urllib import urlencode
 from lxml.html import fromstring
 from json import loads
 from sys import path
 path.append('../searx')
 from searx.engines import engines
 # list of names
 # Page scraped by get_wikipedia_languages().
 wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 # Page scraped by get_google_languages().
 google_languages_url = 'https://www.google.com/preferences?#languages'
 # URL template; {parameters} is filled with the url-encoded query string.
 country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
 # get_country_name() returns '' without sending a request while this is empty.
 geonames_user = ''  # add user name here
 # NOTE(review): not referenced by any function in this script.
 google_json_name = 'google.preferences.langMap'
 # Language code -> (name, country, english_name) tuple.
 languages = {}
 # To filter out invalid codes and dialects.
 def valid_code(lang_code):
    """Tell whether a language code should be kept.

    Returns False for a fixed set of unwanted codes and for dialects,
    True for everything else.
    """
    # unwanted codes
    # sl-SL is technically not invalid, but still a mistake
    if lang_code.startswith('xx') \
       or lang_code in ('sl-SL', 'jw') \
       or lang_code.endswith(('UK', 'XA', 'XL')):
        return False

    # dialects: more than two subtags, or a subtag that is too long
    subtags = lang_code.split('-')
    if len(subtags) > 2:
        return False
    if len(subtags[0]) > 3:
        return False
    return len(subtags) == 1 or len(subtags[1]) <= 2
 # Get country name in specified language.
 def get_country_name(locale):
    """Return the name of a locale's country as reported by the Geonames API.

    The part of ``locale`` before ``-`` is sent as ``lang`` and the part
    after it as ``country``, together with ``geonames_user`` as ``username``.

    Returns an empty string when no Geonames user name is configured, when
    ``locale`` does not consist of exactly two ``-`` separated parts, or when
    the response's ``geonames`` list is missing or does not hold exactly one
    entry (in that last case the decoded response is printed as well).
    """
    # compare by value: whether two equal strings are the same object is an
    # implementation detail, so an identity test is not reliable here
    if not geonames_user:
        return ''

    parts = locale.split('-')
    if len(parts) != 2:
        return ''

    query = urlencode({'lang': parts[0],
                       'country': parts[1],
                       'username': geonames_user})
    response = get(country_names_url.format(parameters=query))
    payload = loads(response.text)
    content = payload.get('geonames', None)
    if content is None or len(content) != 1:
        # a single parenthesised argument prints the same on Python 2 and 3
        print("No country name found for " + parts[0] + "-" + parts[1])
        print(payload)
        return ''

    return content[0].get('countryName', '')
 # Get language names from Wikipedia.
 def get_wikipedia_languages():
    """Add the languages listed at ``wiki_languages_url`` to ``languages``.

    Every row below the first one of each table whose class contains
    ``sortable`` is read: code from the 4th cell, native name from the 3rd,
    English name from the 2nd and article count from the 5th.  A code is
    stored as ``(name, '', english_name)`` when it is not listed yet, has at
    least 10000 articles and passes ``valid_code``.
    """
    page = fromstring(get(wiki_languages_url).text)
    for table in page.xpath('//table[contains(@class,"sortable")]'):
        rows = table.xpath('.//tr')
        # the first row is the table header
        for row in rows[1:]:
            cells = row.xpath('./td')
            code = cells[3].xpath('./a')[0].text
            name = cells[2].xpath('./a')[0].text
            english_name = cells[1].xpath('./a')[0].text
            article_count = int(cells[4].xpath('./a/b')[0].text.replace(',', ''))

            if code in languages:
                continue
            # exclude language variants and languages with few articles
            if article_count < 10000 or not valid_code(code):
                continue
            languages[code] = (name, '', english_name)
 # Get language names from Google.
 def get_google_languages():
    """Add the languages offered at ``google_languages_url`` to ``languages``.

    Each ``option`` of the ``hl`` select box is read: the code is the part of
    its value before the first ``-``, the name is its text without the last
    character, title-cased.  A code is stored as ``(name, '', '')`` when it is
    not listed yet and passes ``valid_code``.
    """
    page = fromstring(get(google_languages_url).text)
    for entry in page.xpath('//select[@name="hl"]/option'):
        value = entry.xpath('./@value')[0]
        code = value.split('-')[0]
        name = entry.text[:-1].title()
        if code in languages or not valid_code(code):
            continue
        languages[code] = (name, '', '')
 # Join all language lists.
 # iterate all languages supported by each engine
 def join_language_lists():
    """Add every engine's supported locales to ``languages``.

    Underscores in a locale are replaced with ``-``.  A locale that is not
    listed yet and passes ``valid_code`` is stored with the name and English
    name of its base language (the part before ``-``) and the country name
    returned by ``get_country_name``.  A locale whose base language is not
    listed is printed together with the engine name and skipped.
    """
    for engine_name in engines:
        for locale in engines[engine_name].supported_languages:
            locale = locale.replace('_', '-')
            if locale in languages or not valid_code(locale):
                continue

            # try to get language name
            language = languages.get(locale.split('-')[0])
            if language is None:
                # a single parenthesised argument prints the same on Python 2 and 3
                print(engine_name + ": " + locale)
                continue

            country = get_country_name(locale)
            languages[locale] = (language[0], country, language[2])
 # Remove countryless language if language is featured in only one country.
 def filter_single_country_languages():
    """Drop the countryless entry of languages that have one country variant.

    For every language that appears in ``languages`` with exactly one
    ``<language>-<country>`` code, the plain ``<language>`` entry is deleted
    if there is one.  Languages with no or with several country variants are
    left untouched.
    """
    # count the country variants of every language
    countries = {}
    for code in languages:
        lang, separator, _ = code.partition('-')
        if separator:
            countries[lang] = countries.get(lang, 0) + 1

    for lang, count in countries.items():
        # a language may be listed through its country variants only,
        # in which case there is no countryless entry to delete
        if count == 1 and lang in languages:
            del languages[lang]
 # Write languages.py.
 def write_languages_file():
    """Write ``languages`` to ``languages.py`` in the current directory.

    The file defines ``language_codes``, a tuple with one
    ``(code, name, country, english_name)`` row per entry of ``languages``,
    sorted by code.  The content is encoded as UTF-8 before it is written.
    """
    header = '# -*- coding: utf-8 -*-\n'\
             '# list of language codes\n'\
             '# this file is generated automatically by utils/update_languages.py\n'\
             '\nlanguage_codes = ('

    rows = []
    for code in sorted(languages):
        (name, country, english) = languages[code]
        rows.append('\n    (u"' + code + '"'
                    + ', u"' + name + '"'
                    + ', u"' + country + '"'
                    + ', u"' + english + '")')

    # joining the rows keeps the output valid even when there are no languages
    file_content = header + ','.join(rows) + '\n)\n'

    # the context manager closes the file even if the write fails
    with open('languages.py', 'w') as new_file:
        new_file.write(file_content.encode('utf8'))
 if __name__ == "__main__":
    # run the generation steps in order; each one works on the module-level
    # languages dict filled by the previous steps
    for step in (get_wikipedia_languages,
                 get_google_languages,
                 join_language_lists,
                 filter_single_country_languages,
                 write_languages_file):
        step()