Merge pull request #165 from Cqoicebordel/Moar-engines

Moar engines
Adam Tauber 2015-01-01 14:10:59 +01:00
commit 469e08881e
14 changed files with 191 additions and 51 deletions

View File

@@ -81,7 +81,7 @@ def load_engine(engine_data):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
-            print('[E] Engine config error: Missing attribute "{0}.{1}"'\
+            print('[E] Engine config error: Missing attribute "{0}.{1}"'
                  .format(engine.name, engine_attr))
            sys.exit(1)
@@ -102,7 +102,7 @@ def load_engine(engine_data):
    if engine.shortcut:
        # TODO check duplications
        if engine.shortcut in engine_shortcuts:
-            print('[E] Engine config error: ambigious shortcut: {0}'\
+            print('[E] Engine config error: ambigious shortcut: {0}'
                  .format(engine.shortcut))
            sys.exit(1)
        engine_shortcuts[engine.shortcut] = engine.name

searx/engines/digg.py (new file, 67 lines added)
View File

@@ -0,0 +1,67 @@
## Digg (News, Social media)
#
# @website https://digg.com/
# @provide-api no
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate, thumbnail

from urllib import quote_plus
from json import loads
from lxml import html
from cgi import escape
from dateutil import parser

# engine dependent config
categories = ['news', 'social media']
paging = True

# search-url
base_url = 'https://digg.com/'
search_url = base_url+'api/search/{query}.json?position={position}&format=html'

# specific xpath variables
results_xpath = '//article'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10

    params['url'] = search_url.format(position=offset,
                                      query=quote_plus(query))

    return params


# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.text)

    dom = html.fromstring(search_result['html'])

    # parse results
    for result in dom.xpath(results_xpath):
        url = result.attrib.get('data-contenturl')
        thumbnail = result.xpath('.//img')[0].attrib.get('src')
        title = ''.join(result.xpath(title_xpath))
        content = escape(''.join(result.xpath(content_xpath)))
        pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
        publishedDate = parser.parse(pubdate)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'thumbnail': thumbnail})

    # return results
    return results
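
For context, a minimal sketch (not part of this commit) of how a searx engine module such as the new digg.py gets exercised: searx fills a params dict via request(), performs the HTTP request itself, and hands the response object to response(). The direct module import and the plain requests call below are assumptions made for illustration only.

    import requests
    import digg  # assuming searx/engines/ is on sys.path

    # request() only fills in the URL for the wanted page
    params = digg.request('free software', {'pageno': 1})

    # searx normally performs this step through its own threaded HTTP layer
    resp = requests.get(params['url'])

    # response() parses the JSON-wrapped HTML into result dicts
    for result in digg.response(resp):
        print(result['title'] + ' - ' + result['url'])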

View File

@@ -53,7 +53,8 @@ def response(resp):
    for photo in photos:
-        # In paged configuration, the first pages' photos are represented by a None object
+        # In paged configuration, the first pages' photos
+        # are represented by a None object
        if photo is None:
            continue
@@ -74,10 +75,15 @@ def response(resp):
        title = photo['title']
-        content = '<span class="photo-author">' + photo['owner']['username'] + '</span><br />'
+        content = '<span class="photo-author">' +\
+                  photo['owner']['username'] +\
+                  '</span><br />'
        if 'description' in photo:
-            content = content + '<span class="description">' + photo['description'] + '</span>'
+            content = content +\
+                      '<span class="description">' +\
+                      photo['description'] +\
+                      '</span>'
        # append result
        results.append({'url': url,

View File

@@ -18,16 +18,20 @@ categories = ['images']
nb_per_page = 15
paging = True
-api_key= None
+api_key = None
-url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={api_key}&{text}&sort=relevance&extras=description%2C+owner_name%2C+url_o%2C+url_z&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
+url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search' +\
+      '&api_key={api_key}&{text}&sort=relevance' +\
+      '&extras=description%2C+owner_name%2C+url_o%2C+url_z' +\
+      '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
paging = True

def build_flickr_url(user_id, photo_id):
-    return photo_url.format(userid=user_id,photoid=photo_id)
+    return photo_url.format(userid=user_id, photoid=photo_id)

def request(query, params):
@@ -65,9 +69,12 @@ def response(resp):
        title = photo['title']
-        content = '<span class="photo-author">'+ photo['ownername'] +'</span><br />'
-        content = content + '<span class="description">' + photo['description']['_content'] + '</span>'
+        content = '<span class="photo-author">' +\
+                  photo['ownername'] +\
+                  '</span><br />' +\
+                  '<span class="description">' +\
+                  photo['description']['_content'] +\
+                  '</span>'
        # append result
        results.append({'url': url,

View File

@@ -24,7 +24,7 @@ search_url = url + 'search/{search_term}/{pageno}/'
# specific xpath variables
magnet_xpath = './/a[@title="Torrent magnet link"]'
-#content_xpath = './/font[@class="detDesc"]//text()'
+content_xpath = './/span[@class="font11px lightgrey block"]'

# do search-request
@@ -56,7 +56,8 @@ def response(resp):
        link = result.xpath('.//a[@class="cellMainLink"]')[0]
        href = urljoin(url, link.attrib['href'])
        title = ' '.join(link.xpath('.//text()'))
-        content = escape(html.tostring(result.xpath('.//span[@class="font11px lightgrey block"]')[0], method="text"))
+        content = escape(html.tostring(result.xpath(content_xpath)[0],
+                                       method="text"))
        seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
        leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]

View File

@@ -11,7 +11,6 @@
from urllib import urlencode
from json import loads
import cgi
-import re

# engine dependent config
categories = ['it']
@@ -50,7 +49,8 @@ def response(resp):
    for line, code in sorted(lines.items()):
        content = content + '<tr><td class="line-number" style="padding-right:5px;">'
        content = content + str(line) + '</td><td class="code-snippet">'
-        # Replace every two spaces with ' &nbps;' to keep formatting while allowing the browser to break the line if necessary
+        # Replace every two spaces with ' &nbps;' to keep formatting
+        # while allowing the browser to break the line if necessary
        content = content + cgi.escape(code).replace('\t', '    ').replace('  ', '&nbsp; ').replace('  ', ' &nbsp;')
        content = content + "</td></tr>"
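
A quick illustration (not part of this commit) of what that replacement chain does: cgi.escape() HTML-escapes the snippet, tabs become four spaces, and pairs of spaces are rewritten with &nbsp; so indentation survives while the browser can still break long lines.

    import cgi

    snippet = '\tif x:\n\t\treturn x'
    rendered = cgi.escape(snippet).replace('\t', '    ') \
                                  .replace('  ', '&nbsp; ') \
                                  .replace('  ', ' &nbsp;')
    # rendered is now:
    #   &nbsp; &nbsp; if x:
    #   &nbsp; &nbsp; &nbsp; &nbsp; return x
    print(rendered)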

View File

@@ -37,8 +37,15 @@ def response(resp):
    # parse results
    for result in search_results['results']:
        href = result['url']
-        title = "[" + result['type'] + "] " + result['namespace'] + " " + result['name']
-        content = '<span class="highlight">[' + result['type'] + "] " + result['name'] + " " + result['synopsis'] + "</span><br />" + result['description']
+        title = "[" + result['type'] + "] " +\
+                result['namespace'] +\
+                " " + result['name']
+        content = '<span class="highlight">[' +\
+                  result['type'] + "] " +\
+                  result['name'] + " " +\
+                  result['synopsis'] +\
+                  "</span><br />" +\
+                  result['description']

        # append result
        results.append({'url': href,

View File

@@ -60,10 +60,14 @@ def response(resp):
        content = result.xpath('.//div[contains(@class,"red")]//text()')[0]
        content = content + " - "
-        content = content + html.tostring(result.xpath('.//div[contains(@class,"grey-web")]')[0], method='text')
+        text = result.xpath('.//div[contains(@class,"grey-web")]')[0]
+        content = content + html.tostring(text, method='text')

        if result.xpath(".//span") != []:
-            content = content + " - (" + result.xpath(".//span//text()")[0].strip() + ")"
+            content = content +\
+                      " - (" +\
+                      result.xpath(".//span//text()")[0].strip() +\
+                      ")"

        # append result
        results.append({'url': href,

View File

@@ -1,6 +1,6 @@
## Twitter (Social media)
#
-# @website https://www.bing.com/news
+# @website https://twitter.com/
# @provide-api yes (https://dev.twitter.com/docs/using-search)
#
# @using-api no
@@ -14,6 +14,7 @@ from urlparse import urljoin
from urllib import urlencode
from lxml import html
from cgi import escape
+from datetime import datetime

# engine dependent config
categories = ['social media']
@@ -27,7 +28,8 @@ search_url = base_url+'search?'
results_xpath = '//li[@data-item-type="tweet"]'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/span[@class="username js-action-profile-name"]//text()'
-content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
+content_xpath = './/p[@class="js-tweet-text tweet-text"]'
+timestamp_xpath = './/span[contains(@class,"_timestamp")]'

# do search-request
@@ -52,8 +54,17 @@ def response(resp):
        link = tweet.xpath(link_xpath)[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title = ''.join(tweet.xpath(title_xpath))
-        content = escape(''.join(tweet.xpath(content_xpath)))
+        content = escape(html.tostring(tweet.xpath(content_xpath)[0], method='text', encoding='UTF-8').decode("utf-8"))
+        pubdate = tweet.xpath(timestamp_xpath)
+        if len(pubdate) > 0:
+            timestamp = float(pubdate[0].attrib.get('data-time'))
+            publishedDate = datetime.fromtimestamp(timestamp, None)
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content,
+                            'publishedDate': publishedDate})
+        else:
            # append result
            results.append({'url': url,
                            'title': title,
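
For reference, the new data-time handling assumes Twitter exposes the tweet time as Unix epoch seconds in that attribute; a tiny sketch of the conversion (illustration only, sample value made up):

    from datetime import datetime

    # e.g. <span class="_timestamp" data-time="1420117859">
    timestamp = float('1420117859')
    publishedDate = datetime.fromtimestamp(timestamp, None)
    print(publishedDate)  # 2015-01-01 13:10:59 UTC, shown in local time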

View File

@@ -154,7 +154,6 @@ def load_https_rules(rules_path):
    print(' * {n} https-rules loaded'.format(n=len(https_rules)))

def https_url_rewrite(result):
    skip_https_rewrite = False

    # check if HTTPS rewrite is possible

View File

@@ -69,11 +69,16 @@ def threaded_requests(requests):
        print('engine timeout: {0}'.format(th._engine_name))

# get default reqest parameter
def default_request_params():
    return {
-        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}, 'verify': True}
+        'method': 'GET',
+        'headers': {},
+        'data': {},
+        'url': '',
+        'cookies': {},
+        'verify': True
+    }

# create a callback wrapper for the search engine results
@@ -487,14 +492,15 @@ class Search(object):
                continue

            # append request to list
-            requests.append((req, request_params['url'], request_args, selected_engine['name']))
+            requests.append((req, request_params['url'],
+                             request_args,
+                             selected_engine['name']))

        if not requests:
            return results, suggestions, answers, infoboxes
        # send all search-request
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

View File

@@ -45,6 +45,10 @@ engines:
    engine : duckduckgo_definitions
    shortcut : ddd

+  - name : digg
+    engine : digg
+    shortcut : dg
+
  - name : wikidata
    engine : wikidata
    shortcut : wd
@@ -99,6 +103,33 @@ engines:
    engine : google_news
    shortcut : gon

+  - name : google play apps
+    engine : xpath
+    search_url : https://play.google.com/store/search?q={query}&c=apps
+    url_xpath : //a[@class="title"]/@href
+    title_xpath : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : files
+    shortcut : gpa
+
+  - name : google play movies
+    engine : xpath
+    search_url : https://play.google.com/store/search?q={query}&c=movies
+    url_xpath : //a[@class="title"]/@href
+    title_xpath : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : videos
+    shortcut : gpm
+
+  - name : google play music
+    engine : xpath
+    search_url : https://play.google.com/store/search?q={query}&c=music
+    url_xpath : //a[@class="title"]/@href
+    title_xpath : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : music
+    shortcut : gps
+
  - name : openstreetmap
    engine : openstreetmap
    shortcut : osm
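
The three Google Play entries above rely on searx's generic xpath engine, which fetches search_url and applies the configured XPath expressions to the returned page. Roughly, and leaving out the paging and escaping the real engine does, the apps entry behaves like this standalone sketch (illustration only):

    import requests
    from urllib import quote_plus
    from lxml import html

    search_url = 'https://play.google.com/store/search?q={query}&c=apps'
    page = requests.get(search_url.format(query=quote_plus('chess')))
    dom = html.fromstring(page.content)

    urls = dom.xpath('//a[@class="title"]/@href')      # url_xpath
    titles = dom.xpath('//a[@class="title"]')          # title_xpath
    contents = dom.xpath('//a[@class="subtitle"]')     # content_xpath

    for url, title, content in zip(urls, titles, contents):
        print('%s (%s): %s' % (title.text_content().strip(), url,
                               content.text_content().strip()))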

View File

@@ -30,7 +30,8 @@ def gen_useragent():
def searx_useragent():
-    return 'searx/{searx_version} {suffix}'.format(searx_version=VERSION_STRING,
-                                                   suffix=settings['server'].get('useragent_suffix', ''))
+    return 'searx/{searx_version} {suffix}'.format(
+        searx_version=VERSION_STRING,
+        suffix=settings['server'].get('useragent_suffix', ''))