Merge pull request #152 from pointhi/search_engines

[enh] add photon engine, and fix pep8 errors
Commit 813247b37a by Adam Tauber, 2014-12-19 20:06:21 +01:00
11 changed files with 200 additions and 38 deletions

View File

@@ -57,12 +57,16 @@ def response(resp):
         link = result.xpath('.//div[@class="newstitle"]/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
-        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')
+        contentXPath = result.xpath('.//div[@class="sn_txt"]/div'
+                                    '//span[@class="sn_snip"]//text()')
         if contentXPath is not None:
             content = escape(' '.join(contentXPath))

         # parse publishedDate
-        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div//span[contains(@class,"sn_ST")]//span[contains(@class,"sn_tm")]//text()')
+        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
+                                          '//span[contains(@class,"sn_ST")]'
+                                          '//span[contains(@class,"sn_tm")]'
+                                          '//text()')
         if publishedDateXPath is not None:
             publishedDate = escape(' '.join(publishedDateXPath))

@@ -74,7 +78,8 @@ def response(resp):
             timeNumbers = re.findall(r'\d+', publishedDate)
             publishedDate = datetime.now()\
                 - timedelta(hours=int(timeNumbers[0]))
-        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
+        elif re.match("^[0-9]+ hour(s|),"
+                      " [0-9]+ minute(s|) ago$", publishedDate):
             timeNumbers = re.findall(r'\d+', publishedDate)
             publishedDate = datetime.now()\
                 - timedelta(hours=int(timeNumbers[0]))\
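Most of the pep8 fixes in this PR rely on the same device: Python joins adjacent string literals at compile time, so a long XPath or URL literal can be wrapped across lines without changing its value. A minimal illustration (not part of the diff; the variable names are made up):

    # Adjacent literals inside parentheses are concatenated by the parser,
    # so both spellings denote the identical string.
    xpath_one_line = './/div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()'
    xpath_wrapped = ('.//div[@class="sn_txt"]/div'
                     '//span[@class="sn_snip"]//text()')
    assert xpath_one_line == xpath_wrapped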

View File

@@ -22,10 +22,17 @@ api_key = None
 # search-url
 url = 'http://www.faroo.com/'
-search_url = url + 'api?{query}&start={offset}&length={number_of_results}&l={language}&src={categorie}&i=false&f=json&key={api_key}'
+search_url = url + 'api?{query}'\
+    '&start={offset}'\
+    '&length={number_of_results}'\
+    '&l={language}'\
+    '&src={categorie}'\
+    '&i=false'\
+    '&f=json'\
+    '&key={api_key}'  # noqa

 search_category = {'general': 'web',
                    'news': 'news'}

 # do search-request

@@ -80,8 +87,8 @@ def response(resp):
     # parse results
     for result in search_res['results']:
         if result['news']:
-            # timestamp (how many milliseconds have passed between now and the beginning of 1970)
-            publishedDate = datetime.datetime.fromtimestamp(result['date']/1000.0)
+            # timestamp (milliseconds since 1970)
+            publishedDate = datetime.datetime.fromtimestamp(result['date']/1000.0)  # noqa

             # append news result
             results.append({'url': result['url'],
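Faroo reports the date as milliseconds since the Unix epoch, while datetime.fromtimestamp() expects seconds, hence the division by 1000.0. A quick sketch with an invented API value:

    from datetime import datetime

    date_ms = 1419015981000  # hypothetical 'date' field, milliseconds since 1970
    published = datetime.fromtimestamp(date_ms / 1000.0)
    print(published)  # 2014-12-19 20:06:21 when run in a UTC+1 timezone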

View File

@@ -9,7 +9,7 @@
 # @stable yes (but deprecated)
 # @parse url, title, img_src

-from urllib import urlencode,unquote
+from urllib import urlencode, unquote
 from json import loads

 # engine dependent config

View File

@@ -1,8 +1,8 @@
 ## Kickass Torrent (Videos, Music, Files)
 #
 # @website https://kickass.so
 # @provide-api no (nothing found)
 #
 # @using-api no
 # @results HTML (using search portal)
 # @stable yes (HTML can change)

@@ -13,7 +13,6 @@ from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
-from dateutil import parser

 # engine dependent config
 categories = ['videos', 'music', 'files']

@@ -33,7 +32,8 @@ def request(query, params):
     params['url'] = search_url.format(search_term=quote(query),
                                       pageno=params['pageno'])

-    # FIX: SSLError: hostname 'kickass.so' doesn't match either of '*.kickass.to', 'kickass.to'
+    # FIX: SSLError: hostname 'kickass.so'
+    # doesn't match either of '*.kickass.to', 'kickass.to'
     params['verify'] = False

     return params
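params['verify'] is handed through to the requests library, where verify=False disables TLS certificate validation for that request; this silences the hostname-mismatch SSLError at the price of losing protection against man-in-the-middle attacks. Expressed as a bare requests call (URL made up for illustration):

    import requests

    # With verify=False, a certificate that does not match the hostname
    # no longer raises requests.exceptions.SSLError.
    resp = requests.get('https://kickass.so/usearch/example/1/', verify=False)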

View File

@@ -28,15 +28,17 @@ search_url = base_url + 'w/api.php?action=query'\
     '&srprop=timestamp'\
     '&format=json'\
     '&sroffset={offset}'\
-    '&srlimit={limit}'
+    '&srlimit={limit}'  # noqa

 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * number_of_results

     string_args = dict(query=urlencode({'srsearch': query}),
                        offset=offset,
                        limit=number_of_results)

     format_strings = list(Formatter().parse(base_url))

     if params['language'] == 'all':

@@ -67,7 +69,8 @@ def response(resp):
     # parse results
     for result in search_results['query']['search']:
-        url = base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
+        url = base_url.format(language=resp.search_params['language']) +\
+            'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))

         # append result
         results.append({'url': url,
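The wiki URL is assembled by percent-encoding the page title; under Python 2 (which this codebase targets, judging by the urllib imports), urllib.quote wants a byte string, hence the .encode('utf-8') before quoting. A standalone sketch, assuming English Wikipedia as the formatted base_url:

    from urllib import quote  # Python 2 stdlib, as in the engine

    base_url = 'https://en.wikipedia.org/'  # assumed result of the {language} format
    title = u'Ludwig van Beethoven'
    url = base_url + 'wiki/' + quote(title.replace(' ', '_').encode('utf-8'))
    # -> https://en.wikipedia.org/wiki/Ludwig_van_Beethoven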

View File

@@ -9,20 +9,24 @@
 # @parse url, title

 from json import loads
+from searx.utils import searx_useragent

 # engine dependent config
 categories = ['map']
 paging = False

 # search-url
-url = 'https://nominatim.openstreetmap.org/search/{query}?format=json&polygon_geojson=1&addressdetails=1'
+base_url = 'https://nominatim.openstreetmap.org/'
+search_string = 'search/{query}?format=json&polygon_geojson=1&addressdetails=1'
 result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'

 # do search-request
 def request(query, params):
-    params['url'] = url.format(query=query)
+    params['url'] = base_url + search_string.format(query=query)
+
+    # using searx User-Agent
+    params['headers']['User-Agent'] = searx_useragent()

     return params

@@ -68,8 +72,8 @@ def response(resp):
         address.update({'house_number': address_raw.get('house_number'),
                         'road': address_raw.get('road'),
                         'locality': address_raw.get('city',
-                                    address_raw.get('town',
-                                    address_raw.get('village'))),
+                                    address_raw.get('town',  # noqa
+                                    address_raw.get('village'))),  # noqa
                         'postcode': address_raw.get('postcode'),
                         'country': address_raw.get('country'),
                         'country_code': address_raw.get('country_code')})
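The locality lookup chains dict.get() defaults so that the first present key among city, town, and village wins; the # noqa markers only silence the line-length warnings the nesting still produces. The fallback in isolation, with a made-up Nominatim payload:

    # Trimmed, hypothetical address block from a Nominatim response.
    address_raw = {'village': 'Hallstatt', 'country': 'Austria'}

    # get() chaining: 'city' if present, else 'town', else 'village'.
    locality = address_raw.get('city',
                               address_raw.get('town',
                                               address_raw.get('village')))
    print(locality)  # -> Hallstatt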

View File

searx/engines/photon.py (new file, 128 lines)

@@ -0,0 +1,128 @@
## Photon (Map)
#
# @website https://photon.komoot.de
# @provide-api yes (https://photon.komoot.de/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title

from urllib import urlencode
from json import loads
from searx.utils import searx_useragent

# engine dependent config
categories = ['map']
paging = False
language_support = True
number_of_results = 10

# search-url
base_url = 'https://photon.komoot.de/'
search_string = 'api/?{query}&limit={limit}'
result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'


# do search-request
def request(query, params):
    params['url'] = base_url +\
        search_string.format(query=urlencode({'q': query}),
                             limit=number_of_results)

    if params['language'] != 'all':
        params['url'] = params['url'] +\
            "&lang=" + params['language'].replace('_', '-')

    # using searx User-Agent
    params['headers']['User-Agent'] = searx_useragent()

    # FIX: SSLError: SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
    params['verify'] = False

    return params


# get response from search-request
def response(resp):
    results = []

    json = loads(resp.text)

    # parse results
    for r in json.get('features', {}):
        properties = r.get('properties')

        if not properties:
            continue

        # get title
        title = properties['name']

        # get osm-type
        if properties.get('osm_type') == 'N':
            osm_type = 'node'
        elif properties.get('osm_type') == 'W':
            osm_type = 'way'
        elif properties.get('osm_type') == 'R':
            osm_type = 'relation'
        else:
            # continue if invalid osm-type
            continue

        url = result_base_url.format(osm_type=osm_type,
                                     osm_id=properties.get('osm_id'))

        osm = {'type': osm_type,
               'id': properties.get('osm_id')}

        geojson = r.get('geometry')

        if properties.get('extent'):
            boundingbox = [properties.get('extent')[3],
                           properties.get('extent')[1],
                           properties.get('extent')[0],
                           properties.get('extent')[2]]
        else:
            # TODO: better boundingbox calculation
            boundingbox = [geojson['coordinates'][1],
                           geojson['coordinates'][1],
                           geojson['coordinates'][0],
                           geojson['coordinates'][0]]

        # address calculation
        address = {}

        # get name
        if properties.get('osm_key') == 'amenity' or\
           properties.get('osm_key') == 'shop' or\
           properties.get('osm_key') == 'tourism' or\
           properties.get('osm_key') == 'leisure':
            address = {'name': properties.get('name')}

        # add rest of address data, if something is already found
        if address.get('name'):
            address.update({'house_number': properties.get('housenumber'),
                            'road': properties.get('street'),
                            'locality': properties.get('city',
                                        properties.get('town',  # noqa
                                        properties.get('village'))),  # noqa
                            'postcode': properties.get('postcode'),
                            'country': properties.get('country')})
        else:
            address = None

        # append result
        results.append({'template': 'map.html',
                        'title': title,
                        'content': '',
                        'longitude': geojson['coordinates'][0],
                        'latitude': geojson['coordinates'][1],
                        'boundingbox': boundingbox,
                        'geojson': geojson,
                        'address': address,
                        'osm': osm,
                        'url': url})

    # return results
    return results
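For orientation, the new engine follows searx's usual two-function protocol: request() fills in params['url'] and headers for the outgoing call, and response() turns the HTTP reply into result dicts. A dry-run sketch, assuming the module is importable as searx.engines.photon and feeding response() a canned JSON body instead of a live API reply:

    from searx.engines import photon

    # Build the outgoing request roughly the way searx's search loop would.
    params = {'language': 'de_DE', 'headers': {}}
    params = photon.request('Innsbruck', params)
    print(params['url'])
    # e.g. https://photon.komoot.de/api/?q=Innsbruck&limit=10&lang=de-DE

    # response() only needs an object with a .text attribute holding JSON.
    class FakeResponse(object):
        text = '{"features": []}'

    print(photon.response(FakeResponse()))  # -> []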

View File

@@ -20,7 +20,12 @@ guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'
 # search-url
 url = 'https://api.soundcloud.com/'
-search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id={client_id}'
+search_url = url + 'search?{query}'\
+    '&facet=model'\
+    '&limit=20'\
+    '&offset={offset}'\
+    '&linked_partitioning=1'\
+    '&client_id={client_id}'  # noqa

 # do search-request
# do search-request # do search-request

View File

@@ -24,7 +24,11 @@ number_of_results = 5
 # search-url
 base_url = 'http://localhost:8090'
-search_url = '/yacysearch.json?{query}&startRecord={offset}&maximumRecords={limit}&contentdom={search_type}&resource=global'
+search_url = '/yacysearch.json?{query}'\
+    '&startRecord={offset}'\
+    '&maximumRecords={limit}'\
+    '&contentdom={search_type}'\
+    '&resource=global'  # noqa

 # yacy specific type-definitions
 search_types = {'general': 'text',

@@ -39,10 +43,11 @@ def request(query, params):
     offset = (params['pageno'] - 1) * number_of_results
     search_type = search_types.get(params['category'], '0')

-    params['url'] = base_url + search_url.format(query=urlencode({'query': query}),
-                                                 offset=offset,
-                                                 limit=number_of_results,
-                                                 search_type=search_type)
+    params['url'] = base_url +\
+        search_url.format(query=urlencode({'query': query}),
+                          offset=offset,
+                          limit=number_of_results,
+                          search_type=search_type)

     # add language tag if specified
     if params['language'] != 'all':

@@ -70,19 +75,19 @@ def response(resp):
             # append result
             results.append({'url': result['link'],
                             'title': result['title'],
                             'content': result['description'],
                             'publishedDate': publishedDate})

     elif resp.search_params['category'] == 'images':
         # parse image results
         for result in search_results:
             # append result
             results.append({'url': result['url'],
                             'title': result['title'],
                             'content': '',
                             'img_src': result['image'],
                             'template': 'images.html'})

     #TODO parse video, audio and file results

View File

@@ -20,7 +20,8 @@ paging = True
 language_support = True

 # search-url
-search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+base_url = 'https://search.yahoo.com/'
+search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'

 # specific xpath variables
 results_xpath = '//div[@class="res"]'

@@ -57,9 +58,9 @@ def request(query, params):
     else:
         language = params['language'].split('_')[0]

-    params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}),
-                                      lang=language)
+    params['url'] = base_url + search_url.format(offset=offset,
+                                                 query=urlencode({'p': query}),
+                                                 lang=language)

     # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\

View File

@@ -95,6 +95,10 @@ engines:
     engine : openstreetmap
     shortcut : osm

+  - name : photon
+    engine : photon
+    shortcut : ph
+
 #  - name : piratebay
 #    engine : piratebay
 #    shortcut : tpb