Merge pull request #452 from pointhi/engine_fix

[enh] fix content fetching, parse published date from description for startpage and ixquick
This commit is contained in:
Adam Tauber 2015-10-26 09:29:20 +01:00
commit 3a2f29344a
2 changed files with 40 additions and 9 deletions

View File

@ -12,6 +12,8 @@
from lxml import html from lxml import html
from cgi import escape from cgi import escape
from dateutil import parser
from datetime import datetime, timedelta
import re import re
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
@ -79,11 +81,40 @@ def response(resp):
title = escape(extract_text(link)) title = escape(extract_text(link))
if result.xpath('./p[@class="desc"]'): if result.xpath('./p[@class="desc clk"]'):
content = escape(extract_text(result.xpath('./p[@class="desc"]'))) content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else: else:
content = '' content = ''
published_date = None
# check if search result starts with something like: "2 Sep 2014 ... "
if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
published_date = parser.parse(date_string, dayfirst=True)
# fix content string
content = content[date_pos:]
# check if search result starts with something like: "5 days ago ... "
elif re.match("^[0-9]+ days? ago \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
# calculate datetime
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
# fix content string
content = content[date_pos:]
if published_date:
# append result
results.append({'url': url,
'title': title,
'content': content,
'publishedDate': published_date})
else:
# append result # append result
results.append({'url': url, results.append({'url': url,
'title': title, 'title': title,

View File

@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
</a> </a>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>
@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
</a> </a>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>
@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
<h3> <h3>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>