[enh] fix content fetching, parse published date from description

This commit is contained in:
Thomas Pointhuber 2015-10-24 16:15:30 +02:00
parent a959977ab4
commit 4508c96667
2 changed files with 40 additions and 9 deletions

View file

@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
from dateutil import parser
from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@ -79,11 +81,40 @@ def response(resp):
title = escape(extract_text(link))
if result.xpath('./p[@class="desc"]'):
content = escape(extract_text(result.xpath('./p[@class="desc"]')))
if result.xpath('./p[@class="desc clk"]'):
content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
published_date = None
# check if search result starts with something like: "2 Sep 2014 ... "
if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
published_date = parser.parse(date_string, dayfirst=True)
# fix content string
content = content[date_pos:]
# check if search result starts with something like: "5 days ago ... "
elif re.match("^[0-9]+ days? ago \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
# calculate datetime
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
# fix content string
content = content[date_pos:]
if published_date:
# append result
results.append({'url': url,
'title': title,
'content': content,
'publishedDate': published_date})
else:
# append result
results.append({'url': url,
'title': title,

View file

@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
</a>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
<p class='desc'>
<p class='desc clk'>
This should be the content.
</p>
<p>
@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
</a>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
<p class='desc'>
<p class='desc clk'>
This should be the content.
</p>
<p>
@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
<h3>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
<p class='desc'>
<p class='desc clk'>
This should be the content.
</p>
<p>