From b8fc531b60221756446d50b1055161ec6dd1c34c Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 5 Jun 2015 11:23:24 +0200 Subject: [PATCH 1/3] [enh] google engine : parse map links and more --- searx/engines/google.py | 180 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 166 insertions(+), 14 deletions(-) diff --git a/searx/engines/google.py b/searx/engines/google.py index 785cd5e6..3684a9e6 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -8,6 +8,7 @@ # @stable no (HTML can change) # @parse url, title, content, suggestion +import re from urllib import urlencode from urlparse import urlparse, parse_qsl from lxml import html @@ -78,15 +79,22 @@ country_to_hostname = { 'TW': 'www.google.com.tw' # Taiwan } +# osm +url_map = 'https://www.openstreetmap.org/'\ + + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' + # search-url search_path = '/search' -maps_path = '/maps/' -redirect_path = '/url' -images_path = '/images' search_url = ('https://{hostname}' + search_path + '?{query}&start={offset}&gbv=1') +# other URLs +map_hostname_start = 'maps.google.' 
+maps_path = '/maps' +redirect_path = '/url' +images_path = '/images' + # specific xpath variables results_xpath = '//li[@class="g"]' url_xpath = './/h3/a/@href' @@ -95,10 +103,32 @@ content_xpath = './/span[@class="st"]' content_misc_xpath = './/div[@class="f slp"]' suggestion_xpath = '//p[@class="_Bmc"]' +# map : detail location +map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()' +map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span' +map_website_url_xpath = 'h3[2]/a/@href' +map_website_title_xpath = 'h3[2]' + +# map : near the location +map_near = 'table[@class="ts"]//tr' +map_near_title = './/h4' +map_near_url = './/h4/a/@href' +map_near_phone = './/span[@class="nobr"]' + +# images images_xpath = './/div/a' image_url_xpath = './@href' image_img_src_xpath = './img/@src' +# property names +# FIXME : no translation +property_address = "Address" +property_phone = "Phone number" +property_location = "Location" +property_website = "Web site" +property_gplus_website = "Google plus" + +# cookies pref_cookie = '' nid_cookie = {} @@ -122,6 +152,11 @@ def get_google_nid_cookie(google_hostname): # remove google-specific tracking-url def parse_url(url_string, google_hostname): + # sanity check + if url_string is None: + return url_string + + # normal case parsed_url = urlparse(url_string) if (parsed_url.netloc in [google_hostname, ''] and parsed_url.path == redirect_path): @@ -131,6 +166,19 @@ def parse_url(url_string, google_hostname): return url_string +# URL : get label +def url_get_label(url_string): + # sanity check + if url_string is None: + return url_string + + # normal case + parsed_url = urlparse(url_string) + if parsed_url.netloc == 'plus.google.com': + return property_gplus_website + return property_website + + # returns extract_text on the first result selected by the xpath or None def extract_text_from_dom(result, xpath): r = result.xpath(xpath) @@ -151,7 +199,7 @@ def request(query, params): if len(language_array) == 2: country = 
language_array[1] else: - country = ' ' + country = 'US' language = language_array[0] + ',' + language_array[0] + '-' + country if use_locale_domain: @@ -196,21 +244,32 @@ def response(resp): try: url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname) parsed_url = urlparse(url, google_hostname) - if (parsed_url.netloc == google_hostname - and (parsed_url.path == search_path - or parsed_url.path.startswith(maps_path))): - # remove the link to google news and google maps - # FIXME : sometimes the URL is https://maps.google.*/maps - # no consequence, the result trigger an exception after which is ignored - continue + + # map result + if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path)) + or (parsed_url.netloc.startswith(map_hostname_start))): + x = result.xpath(map_near) + if len(x) > 0: + # map : near the location + results = results + parse_map_near(parsed_url, x, google_hostname) + else: + # map : detail about a location + results = results + parse_map_detail(parsed_url, result, google_hostname) + + # google news + elif (parsed_url.netloc == google_hostname + and parsed_url.path == search_path): + # skipping news results + pass # images result - if (parsed_url.netloc == google_hostname - and parsed_url.path == images_path): + elif (parsed_url.netloc == google_hostname + and parsed_url.path == images_path): # only thumbnail image provided, # so skipping image results # results = results + parse_images(result, google_hostname) pass + else: # normal result content = extract_text_from_dom(result, content_xpath) @@ -223,7 +282,7 @@ def response(resp): results.append({'url': url, 'title': title, 'content': content}) - except Exception: + except: continue # parse suggestion @@ -249,3 +308,96 @@ def parse_images(result, google_hostname): 'template': 'images.html'}) return results + + +def parse_map_near(parsed_url, x, google_hostname): + results = [] + + for result in x: + title = extract_text_from_dom(result, 
map_near_title) + url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname) + phone = extract_text_from_dom(result, map_near_phone) + if phone is not None: + phone = property_phone + ": " + phone + results.append({'url': url, + 'title': title, + 'content': phone}) + + return results + + +def parse_map_detail(parsed_url, result, google_hostname): + results = [] + + # try to parse the geoloc + m = re.search('@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path) + if m is None: + m = re.search('ll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query) + + if m is not None: + # geoloc found + lon = float(m.group(2)) + lat = float(m.group(1)) + zoom = int(m.group(3)) + + # TODO : map zoom to dlon / dlat + dlon = 0.000001 + dlat = 0.000001 + + boundingbox = [round(lat - dlat, 7), round(lat + dlat, 7), round(lon - dlon, 7), round(lon + dlon, 7)] + map_url = url_map\ + .replace('{latitude}', str(lat))\ + .replace('{longitude}', str(lon))\ + .replace('{zoom}', str(zoom+2)) + + geojson = {u'type': u'Point', + u'coordinates': [lon, lat] + } + + # attributes + attributes = [] + add_attributes(attributes, property_address, extract_text_from_dom(result, map_address_xpath)) + add_attributes(attributes, property_phone, extract_text_from_dom(result, map_phone_xpath)) + + # title / content / url + website_title = extract_text_from_dom(result, map_website_title_xpath) + content = extract_text_from_dom(result, content_xpath) + website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname) + + # add an infobox if there is a website + if website_url is not None: + results.append({'infobox': website_title, + 'id': website_url, + 'content': content, + 'attributes': attributes, + 'urls': [ + {'title': url_get_label(website_url), 'url': website_url}, + {'title': property_location, 'url': map_url} + ] + }) + + # usefull because user can see the map directly into searx + results.append({'template': 'map.html', + 'title': website_title, + 
'content': (content + '<br />
' if content is not None else '') + + attributes_to_html(attributes), + 'longitude': lon, + 'latitude': lat, + 'boundingbox': boundingbox, + 'geojson': geojson, + 'url': website_url if website_url is not None else map_url + }) + return results + + +def add_attributes(attributes, name, value): + if value is not None and len(value) > 0: + attributes.append({'label': name, 'value': value}) + + +def attributes_to_html(attributes): + retval = '' + for a in attributes: + retval = retval + '' + retval = retval + '
' + a.get('label') + '' + a.get('value') + '
' + return retval From 72c8de35a29f3c58785282d8ca23e33506e01122 Mon Sep 17 00:00:00 2001 From: Dalf Date: Fri, 5 Jun 2015 23:56:23 +0200 Subject: [PATCH 2/3] google engine :remove OSM map --- searx/engines/google.py | 79 +++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 50 deletions(-) diff --git a/searx/engines/google.py b/searx/engines/google.py index 3684a9e6..6b822b92 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -282,7 +282,8 @@ def response(resp): results.append({'url': url, 'title': title, 'content': content}) - except: + except Exception, e: + print e continue # parse suggestion @@ -305,7 +306,8 @@ def parse_images(result, google_hostname): 'title': '', 'content': '', 'img_src': img_src, - 'template': 'images.html'}) + 'template': 'images.html' + }) return results @@ -316,12 +318,13 @@ def parse_map_near(parsed_url, x, google_hostname): for result in x: title = extract_text_from_dom(result, map_near_title) url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname) + attributes = [] phone = extract_text_from_dom(result, map_near_phone) - if phone is not None: - phone = property_phone + ": " + phone - results.append({'url': url, - 'title': title, - 'content': phone}) + add_attributes(attributes, property_phone, phone, 'tel:' + phone) + results.append({'title': title, + 'url': url, + 'content': attributes_to_html(attributes) + }) return results @@ -335,69 +338,45 @@ def parse_map_detail(parsed_url, result, google_hostname): m = re.search('ll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query) if m is not None: - # geoloc found - lon = float(m.group(2)) - lat = float(m.group(1)) - zoom = int(m.group(3)) - - # TODO : map zoom to dlon / dlat - dlon = 0.000001 - dlat = 0.000001 - - boundingbox = [round(lat - dlat, 7), round(lat + dlat, 7), round(lon - dlon, 7), round(lon + dlon, 7)] - map_url = url_map\ - .replace('{latitude}', str(lat))\ - .replace('{longitude}', str(lon))\ - 
.replace('{zoom}', str(zoom+2)) - - geojson = {u'type': u'Point', - u'coordinates': [lon, lat] - } + # geoloc found (ignored) + lon = float(m.group(2)) # noqa + lat = float(m.group(1)) # noqa + zoom = int(m.group(3)) # noqa # attributes attributes = [] - add_attributes(attributes, property_address, extract_text_from_dom(result, map_address_xpath)) - add_attributes(attributes, property_phone, extract_text_from_dom(result, map_phone_xpath)) + address = extract_text_from_dom(result, map_address_xpath) + phone = extract_text_from_dom(result, map_phone_xpath) + add_attributes(attributes, property_address, address, 'geo:' + str(lat) + ',' + str(lon)) + add_attributes(attributes, property_phone, phone, 'tel:' + phone) # title / content / url website_title = extract_text_from_dom(result, map_website_title_xpath) content = extract_text_from_dom(result, content_xpath) website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname) - # add an infobox if there is a website + # add a result if there is a website if website_url is not None: - results.append({'infobox': website_title, - 'id': website_url, - 'content': content, - 'attributes': attributes, - 'urls': [ - {'title': url_get_label(website_url), 'url': website_url}, - {'title': property_location, 'url': map_url} - ] + results.append({'title': website_title, + 'content': (content + '
<br />' if content is not None else '') + + attributes_to_html(attributes), + 'url': website_url + }) - # usefull because user can see the map directly into searx - results.append({'template': 'map.html', - 'title': website_title, - 'content': (content + '<br />
' if content is not None else '') - + attributes_to_html(attributes), - 'longitude': lon, - 'latitude': lat, - 'boundingbox': boundingbox, - 'geojson': geojson, - 'url': website_url if website_url is not None else map_url - }) return results -def add_attributes(attributes, name, value): +def add_attributes(attributes, name, value, url): if value is not None and len(value) > 0: - attributes.append({'label': name, 'value': value}) + attributes.append({'label': name, 'value': value, 'url': url}) def attributes_to_html(attributes): retval = '' for a in attributes: - retval = retval + '' + value = a.get('value') + if 'url' in a: + value = '' + value + '' + retval = retval + '' retval = retval + '
' + a.get('label') + '' + a.get('value') + '
' + a.get('label') + '' + value + '
' return retval From fc0ae0f9073b36912a4302c6285f87331c045415 Mon Sep 17 00:00:00 2001 From: Dalf Date: Sat, 6 Jun 2015 00:18:00 +0200 Subject: [PATCH 3/3] google engine: code cleanup --- searx/engines/google.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/searx/engines/google.py b/searx/engines/google.py index 6b822b92..0e78a9e2 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -124,9 +124,6 @@ image_img_src_xpath = './img/@src' # FIXME : no translation property_address = "Address" property_phone = "Phone number" -property_location = "Location" -property_website = "Web site" -property_gplus_website = "Google plus" # cookies pref_cookie = '' @@ -166,19 +163,6 @@ def parse_url(url_string, google_hostname): return url_string -# URL : get label -def url_get_label(url_string): - # sanity check - if url_string is None: - return url_string - - # normal case - parsed_url = urlparse(url_string) - if parsed_url.netloc == 'plus.google.com': - return property_gplus_website - return property_website - - # returns extract_text on the first result selected by the xpath or None def extract_text_from_dom(result, xpath): r = result.xpath(xpath) @@ -281,9 +265,9 @@ def response(resp): # append result results.append({'url': url, 'title': title, - 'content': content}) - except Exception, e: - print e + 'content': content + }) + except: continue # parse suggestion