From 934ae4e086a26d1c9c8d25946b43789e55696478 Mon Sep 17 00:00:00 2001 From: Austin Huang Date: Thu, 31 Mar 2022 14:45:39 -0400 Subject: [PATCH 1/4] (feat) add jisho.org Closes #1016 --- searx/engines/jisho.py | 125 +++++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 6 ++ 2 files changed, 131 insertions(+) create mode 100644 searx/engines/jisho.py diff --git a/searx/engines/jisho.py b/searx/engines/jisho.py new file mode 100644 index 00000000..6fab054e --- /dev/null +++ b/searx/engines/jisho.py @@ -0,0 +1,125 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +Jisho (the Japanese-English dictionary) +""" + +import json +from urllib.parse import urlencode, urljoin + +# about +about = { + "website": 'https://jisho.org', + "wikidata_id": 'Q24568389', + "official_api_documentation": "https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api", + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['dictionaries'] +paging = False + +URL = 'https://jisho.org' +BASE_URL = 'https://jisho.org/word/' +SEARCH_URL = URL + '/api/v1/search/words?{query}' + + +def request(query, params): + query = urlencode({'keyword': query}) + params['url'] = SEARCH_URL.format(query=query) + logger.debug(f"query_url --> {params['url']}") + return params + + +def response(resp): + results = [] + infoboxed = False + + search_results = json.loads(resp.text) + pages = search_results.get('data', []) + + for page in pages: + # Entries that are purely from Wikipedia are excluded. + if page['senses'][0]['parts_of_speech'][0] != 'Wikipedia definition': + # Process alternative forms + japanese = page['japanese'] + alt_forms = [] + for title_raw in japanese: + if 'word' not in title_raw: + alt_forms.append(title_raw['reading']) + else: + title = title_raw['word'] + if 'reading' in title_raw: + title += ' (' + title_raw['reading'] + ')' + alt_forms.append(title) + # Process definitions + definitions = [] + def_raw = page['senses'] + for defn_raw in def_raw: + extra = '' + if not infoboxed: + # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. + if defn_raw['tags'] != []: + if defn_raw['info'] != []: + extra += defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ' # "usually written as kana: " + else: + extra += ', '.join(defn_raw['tags']) + '. ' # abbreviation, archaism, etc. + elif defn_raw['info'] != []: + extra += ', '.join(defn_raw['info']).capitalize() + '. ' # inconsistent + if defn_raw['restrictions'] != []: + extra += 'Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ' + extra = extra[:-1] + definitions.append(( + ', '.join(defn_raw['parts_of_speech']), + '; '.join(defn_raw['english_definitions']), + extra + )) + content = '' + infobox_content = ''' + JMdict + and JMnedict + by EDRDG, CC BY-SA 3.0.Wikipedia, CC BY-SA 3.0.' + + # For results, we'll return the URL, all alternative forms (as title), + # and all definitions (as description) truncated to 300 characters. + results.append({ + 'url': urljoin(BASE_URL, page['slug']), + 'title': ", ".join(alt_forms), + 'content': content[:300] + (content[300:] and '...') + }) + + # Like Wordnik, we'll return the first result in an infobox too. + if not infoboxed: + infoboxed = True + infobox_urls = [] + infobox_urls.append({ + 'title': 'Jisho.org', + 'url': urljoin(BASE_URL, page['slug']) + }) + infobox = { + 'infobox': alt_forms[0], + 'urls': infobox_urls + } + alt_forms.pop(0) + alt_content = '' + if len(alt_forms) > 0: + alt_content = '

Other forms: ' + alt_content += ", ".join(alt_forms) + alt_content += '

' + infobox['content'] = alt_content + infobox_content + results.append(infobox) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index 9e9f1f27..48b07454 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -798,6 +798,12 @@ engines: timeout: 3.0 disabled: true + - name: jisho + engine: jisho + shortcut: js + timeout: 4.0 + disabled: true + - name: kickass engine: kickass shortcut: kc From a399248f56e6975c78f617defc5ce7df2f62a828 Mon Sep 17 00:00:00 2001 From: Austin Huang Date: Fri, 1 Apr 2022 09:18:19 -0400 Subject: [PATCH 2/4] update jisho.py according to suggestions --- searx/engines/jisho.py | 165 +++++++++++++++++++++-------------------- 1 file changed, 84 insertions(+), 81 deletions(-) diff --git a/searx/engines/jisho.py b/searx/engines/jisho.py index 6fab054e..c1324635 100644 --- a/searx/engines/jisho.py +++ b/searx/engines/jisho.py @@ -14,9 +14,11 @@ about = { "use_official_api": True, "require_api_key": False, "results": 'JSON', + "language": 'ja', } categories = ['dictionaries'] +engine_type = 'online_dictionary' paging = False URL = 'https://jisho.org' @@ -35,91 +37,92 @@ def response(resp): results = [] infoboxed = False - search_results = json.loads(resp.text) + search_results = resp.json() pages = search_results.get('data', []) for page in pages: # Entries that are purely from Wikipedia are excluded. - if page['senses'][0]['parts_of_speech'][0] != 'Wikipedia definition': - # Process alternative forms - japanese = page['japanese'] - alt_forms = [] - for title_raw in japanese: - if 'word' not in title_raw: - alt_forms.append(title_raw['reading']) - else: - title = title_raw['word'] - if 'reading' in title_raw: - title += ' (' + title_raw['reading'] + ')' - alt_forms.append(title) - # Process definitions - definitions = [] - def_raw = page['senses'] - for defn_raw in def_raw: - extra = '' - if not infoboxed: - # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. - if defn_raw['tags'] != []: - if defn_raw['info'] != []: - extra += defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ' # "usually written as kana: " - else: - extra += ', '.join(defn_raw['tags']) + '. ' # abbreviation, archaism, etc. - elif defn_raw['info'] != []: - extra += ', '.join(defn_raw['info']).capitalize() + '. ' # inconsistent - if defn_raw['restrictions'] != []: - extra += 'Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ' - extra = extra[:-1] - definitions.append(( - ', '.join(defn_raw['parts_of_speech']), - '; '.join(defn_raw['english_definitions']), - extra - )) - content = '' - infobox_content = ''' - JMdict - and JMnedict - by EDRDG, CC BY-SA 3.0.
    - ''' - for pos, engdef, extra in definitions: - if pos == 'Wikipedia definition': - infobox_content += '
Wikipedia, CC BY-SA 3.0.
    ' - if pos == '': - infobox_content += f"
  • {engdef}" - else: - infobox_content += f"
  • {pos}: {engdef}" - if extra != '': - infobox_content += f" ({extra})" - infobox_content += '
  • ' - content += f"{engdef}. " - infobox_content += '
' - - # For results, we'll return the URL, all alternative forms (as title), - # and all definitions (as description) truncated to 300 characters. - results.append({ - 'url': urljoin(BASE_URL, page['slug']), - 'title': ", ".join(alt_forms), - 'content': content[:300] + (content[300:] and '...') - }) - - # Like Wordnik, we'll return the first result in an infobox too. + if page['senses'][0]['parts_of_speech'] != [] and page['senses'][0]['parts_of_speech'][0] == 'Wikipedia definition': + pass + # Process alternative forms + japanese = page['japanese'] + alt_forms = [] + for title_raw in japanese: + if 'word' not in title_raw: + alt_forms.append(title_raw['reading']) + else: + title = title_raw['word'] + if 'reading' in title_raw: + title += ' (' + title_raw['reading'] + ')' + alt_forms.append(title) + # Process definitions + definitions = [] + def_raw = page['senses'] + for defn_raw in def_raw: + extra = '' if not infoboxed: - infoboxed = True - infobox_urls = [] - infobox_urls.append({ - 'title': 'Jisho.org', - 'url': urljoin(BASE_URL, page['slug']) - }) - infobox = { - 'infobox': alt_forms[0], - 'urls': infobox_urls - } - alt_forms.pop(0) - alt_content = '' - if len(alt_forms) > 0: - alt_content = '

Other forms: ' - alt_content += ", ".join(alt_forms) - alt_content += '

' - infobox['content'] = alt_content + infobox_content - results.append(infobox) + # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. + if defn_raw['tags'] != []: + if defn_raw['info'] != []: + extra += defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ' # "usually written as kana: " + else: + extra += ', '.join(defn_raw['tags']) + '. ' # abbreviation, archaism, etc. + elif defn_raw['info'] != []: + extra += ', '.join(defn_raw['info']).capitalize() + '. ' # inconsistent + if defn_raw['restrictions'] != []: + extra += 'Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ' + extra = extra[:-1] + definitions.append(( + ', '.join(defn_raw['parts_of_speech']), + '; '.join(defn_raw['english_definitions']), + extra + )) + content = '' + infobox_content = ''' + JMdict + and JMnedict + by EDRDG, CC BY-SA 3.0.
    + ''' + for pos, engdef, extra in definitions: + if pos == 'Wikipedia definition': + infobox_content += '
Wikipedia, CC BY-SA 3.0.
    ' + if pos == '': + infobox_content += f"
  • {engdef}" + else: + infobox_content += f"
  • {pos}: {engdef}" + if extra != '': + infobox_content += f" ({extra})" + infobox_content += '
  • ' + content += f"{engdef}. " + infobox_content += '
' + + # For results, we'll return the URL, all alternative forms (as title), + # and all definitions (as description) truncated to 300 characters. + results.append({ + 'url': urljoin(BASE_URL, page['slug']), + 'title': ", ".join(alt_forms), + 'content': content[:300] + (content[300:] and '...') + }) + + # Like Wordnik, we'll return the first result in an infobox too. + if not infoboxed: + infoboxed = True + infobox_urls = [] + infobox_urls.append({ + 'title': 'Jisho.org', + 'url': urljoin(BASE_URL, page['slug']) + }) + infobox = { + 'infobox': alt_forms[0], + 'urls': infobox_urls + } + alt_forms.pop(0) + alt_content = '' + if len(alt_forms) > 0: + alt_content = '

Other forms: ' + alt_content += ", ".join(alt_forms) + alt_content += '

' + infobox['content'] = alt_content + infobox_content + results.append(infobox) return results From 19fa0095a0ab12ed1f7a79d91edf862faf6fdfcf Mon Sep 17 00:00:00 2001 From: Austin Huang Date: Fri, 1 Apr 2022 09:23:24 -0400 Subject: [PATCH 3/4] (fix) satisfy the linter, and btw reduce timeout --- searx/engines/jisho.py | 1 - searx/settings.yml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/searx/engines/jisho.py b/searx/engines/jisho.py index c1324635..a34d8e42 100644 --- a/searx/engines/jisho.py +++ b/searx/engines/jisho.py @@ -3,7 +3,6 @@ Jisho (the Japanese-English dictionary) """ -import json from urllib.parse import urlencode, urljoin # about diff --git a/searx/settings.yml b/searx/settings.yml index 48b07454..eee0e1d7 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -801,7 +801,7 @@ engines: - name: jisho engine: jisho shortcut: js - timeout: 4.0 + timeout: 3.0 disabled: true - name: kickass From 74c7aee9ec52e6b954e48817501a334f23a40e25 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Sat, 2 Apr 2022 15:21:58 +0200 Subject: [PATCH 4/4] jisho : code refactoring --- searx/engines/jisho.py | 143 ++++++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 67 deletions(-) diff --git a/searx/engines/jisho.py b/searx/engines/jisho.py index a34d8e42..87bbe983 100644 --- a/searx/engines/jisho.py +++ b/searx/engines/jisho.py @@ -17,7 +17,6 @@ about = { } categories = ['dictionaries'] -engine_type = 'online_dictionary' paging = False URL = 'https://jisho.org' @@ -34,19 +33,19 @@ def request(query, params): def response(resp): results = [] - infoboxed = False + first_result = True search_results = resp.json() - pages = search_results.get('data', []) - for page in pages: + for page in search_results.get('data', []): # Entries that are purely from Wikipedia are excluded. - if page['senses'][0]['parts_of_speech'] != [] and page['senses'][0]['parts_of_speech'][0] == 'Wikipedia definition': + parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech') + if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition': pass + # Process alternative forms - japanese = page['japanese'] alt_forms = [] - for title_raw in japanese: + for title_raw in page['japanese']: if 'word' not in title_raw: alt_forms.append(title_raw['reading']) else: @@ -54,74 +53,84 @@ def response(resp): if 'reading' in title_raw: title += ' (' + title_raw['reading'] + ')' alt_forms.append(title) - # Process definitions - definitions = [] - def_raw = page['senses'] - for defn_raw in def_raw: - extra = '' - if not infoboxed: - # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. - if defn_raw['tags'] != []: - if defn_raw['info'] != []: - extra += defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ' # "usually written as kana: " - else: - extra += ', '.join(defn_raw['tags']) + '. ' # abbreviation, archaism, etc. - elif defn_raw['info'] != []: - extra += ', '.join(defn_raw['info']).capitalize() + '. ' # inconsistent - if defn_raw['restrictions'] != []: - extra += 'Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ' - extra = extra[:-1] - definitions.append(( - ', '.join(defn_raw['parts_of_speech']), - '; '.join(defn_raw['english_definitions']), - extra - )) - content = '' - infobox_content = ''' - JMdict - and JMnedict - by EDRDG, CC BY-SA 3.0.
    - ''' - for pos, engdef, extra in definitions: - if pos == 'Wikipedia definition': - infobox_content += '
Wikipedia, CC BY-SA 3.0.
    ' - if pos == '': - infobox_content += f"
  • {engdef}" - else: - infobox_content += f"
  • {pos}: {engdef}" - if extra != '': - infobox_content += f" ({extra})" - infobox_content += '
  • ' - content += f"{engdef}. " - infobox_content += '
' + # + result_url = urljoin(BASE_URL, page['slug']) + definitions = get_definitions(page) + # For results, we'll return the URL, all alternative forms (as title), # and all definitions (as description) truncated to 300 characters. + content = " ".join(f"{engdef}." for _, engdef, _ in definitions) results.append({ - 'url': urljoin(BASE_URL, page['slug']), + 'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...') }) # Like Wordnik, we'll return the first result in an infobox too. - if not infoboxed: - infoboxed = True - infobox_urls = [] - infobox_urls.append({ - 'title': 'Jisho.org', - 'url': urljoin(BASE_URL, page['slug']) - }) - infobox = { - 'infobox': alt_forms[0], - 'urls': infobox_urls - } - alt_forms.pop(0) - alt_content = '' - if len(alt_forms) > 0: - alt_content = '

Other forms: ' - alt_content += ", ".join(alt_forms) - alt_content += '

' - infobox['content'] = alt_content + infobox_content - results.append(infobox) + if first_result: + first_result = False + results.append(get_infobox(alt_forms, result_url, definitions)) return results + + +def get_definitions(page): + # Process definitions + definitions = [] + for defn_raw in page['senses']: + extra = [] + # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. + if defn_raw.get('tags'): + if defn_raw.get('info'): + # "usually written as kana: " + extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ') + else: + # abbreviation, archaism, etc. + extra.append(', '.join(defn_raw['tags']) + '. ') + elif defn_raw.get('info'): + # inconsistent + extra.append(', '.join(defn_raw['info']).capitalize() + '. ') + if defn_raw.get('restrictions'): + extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ') + definitions.append(( + ', '.join(defn_raw['parts_of_speech']), + '; '.join(defn_raw['english_definitions']), + ''.join(extra)[:-1], + )) + return definitions + + +def get_infobox(alt_forms, result_url, definitions): + infobox_content = [] + # title & alt_forms + infobox_title = alt_forms[0] + if len(alt_forms) > 1: + infobox_content.append(f'

Other forms: {", ".join(alt_forms[1:])}

') + + # definitions + infobox_content.append(''' + JMdict + and JMnedict + by EDRDG, CC BY-SA 3.0. +
    + ''') + for pos, engdef, extra in definitions: + if pos == 'Wikipedia definition': + infobox_content.append('
Wikipedia, CC BY-SA 3.0.
    ') + pos = f'{pos}: ' if pos else '' + extra = f' ({extra})' if extra else '' + infobox_content.append(f'
  • {pos}{engdef}{extra}
  • ') + infobox_content.append('
') + + # + return { + 'infobox': infobox_title, + 'content': ''.join(infobox_content), + 'urls': [ + { + 'title': 'Jisho.org', + 'url': result_url, + } + ] + }