[enh] fix content fetching, parse published date from description

This commit is contained in:
Thomas Pointhuber 2015-10-24 16:15:30 +02:00
parent a959977ab4
commit 4508c96667
2 changed files with 40 additions and 9 deletions

View file

@ -12,6 +12,8 @@
from lxml import html from lxml import html
from cgi import escape from cgi import escape
from dateutil import parser
from datetime import datetime, timedelta
import re import re
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
@ -79,15 +81,44 @@ def response(resp):
title = escape(extract_text(link)) title = escape(extract_text(link))
if result.xpath('./p[@class="desc"]'): if result.xpath('./p[@class="desc clk"]'):
content = escape(extract_text(result.xpath('./p[@class="desc"]'))) content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else: else:
content = '' content = ''
# append result published_date = None
results.append({'url': url,
'title': title, # check if search result starts with something like: "2 Sep 2014 ... "
'content': content}) if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
published_date = parser.parse(date_string, dayfirst=True)
# fix content string
content = content[date_pos:]
# check if search result starts with something like: "5 days ago ... "
elif re.match("^[0-9]+ days? ago \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
# calculate datetime
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
# fix content string
content = content[date_pos:]
if published_date:
# append result
results.append({'url': url,
'title': title,
'content': content,
'publishedDate': published_date})
else:
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results # return results
return results return results

View file

@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
</a> </a>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>
@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
</a> </a>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>
@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
<h3> <h3>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>