Renewed gemtext2html function

This commit is contained in:
faildev_mode 2023-07-19 08:04:19 +02:00
parent f3a48ec9e6
commit 172f017090
No known key found for this signature in database
GPG Key ID: 70845C70C0F5E205
2 changed files with 65 additions and 61 deletions

View File

@ -14,8 +14,8 @@ import traceback
from item import Item # from itself
from files import save_this, link_this, scan_dir, delete_this # from itself
from config import config # from itself
import html
from bs4 import BeautifulSoup # from beautifulsoup4
from gemtext import GemtextParser # from itself
def template_for(path: str) -> str:
"""Determine used template for the file
@ -111,70 +111,65 @@ def abs2rel(match: re.Match, path: str) -> str:
print(' M', url)
return match.group(1) + url
def gemtext2html(parser: GemtextParser, rotate_extension=True) -> BeautifulSoup:
    """Convert parsed gemtext elements into an HTML tree

    parser -- GemtextParser whose ``elements`` are rendered in order
    rotate_extension -- when true, links ending in ``.gmi`` that carry no
        URL scheme are rewritten to end in ``.html``

    Returns a BeautifulSoup document with one node per element
    (plain text yields one ``<p>`` per blank-line separated paragraph).
    Elements of an unrecognised type are skipped.
    """
    # name the parser explicitly, like the rest of this file does
    soup = BeautifulSoup(features='html.parser')
    headings = {'head1': 'h1', 'head2': 'h2', 'head3': 'h3'}

    for item in parser.elements:
        if item.type == 'plain':
            # paragraphs go straight into the document; no wrapper node
            for content in item.content.split('\n\n'):
                if not content.strip():
                    continue  # do not emit empty <p></p>
                p = soup.new_tag('p')
                p.append(content)
                soup.append(p)
            continue

        if item.type == 'link':
            # put image if it's link to image file
            if os.path.splitext(item.href)[1] in ('.jpg', '.png', '.gif'):
                el = soup.new_tag('img', src=item.href)
                if item.label:
                    el.attrs['alt'] = item.label
            else:
                href = item.href
                if rotate_extension and href.endswith('.gmi') and not re.match(r'^[\w]+:', href):
                    href = os.path.splitext(href)[0] + '.html'
                el = soup.new_tag('a', href=href)
                if item.label:
                    el.append(item.label)
        elif item.type == 'preformatted':
            el = soup.new_tag('pre')
            el.append(item.content)
            # The alt text belongs to the parsed item; ``el.alt`` would look
            # for a child <alt> tag and is always None.
            # NOTE(review): assumes preformatted items expose ``alt`` --
            # confirm against GemtextParser.
            alt = getattr(item, 'alt', None)
            if alt:
                el.attrs['title'] = alt
        elif item.type == 'list':
            el = soup.new_tag('ul')
            for list_item in item.items:
                li = soup.new_tag('li')
                li.append(list_item)
                el.append(li)
        elif item.type in headings:
            el = soup.new_tag(headings[item.type])
            el.append(item.content)
        elif item.type == 'quote':
            el = soup.new_tag('blockquote')
            # NOTE(review): assumes quote items expose ``content`` like
            # headings do -- confirm against GemtextParser.
            content = getattr(item, 'content', None) or ''
            if content:
                # line breaks inside a quote become <br>
                for index, line in enumerate(content.split('\n')):
                    if index:
                        el.append(soup.new_tag('br'))
                    el.append(line)
        else:
            # unknown element type: nothing to render, and ``el`` must not
            # leak over from the previous iteration
            continue

        soup.append(el)
    return soup
def convert_href(href: str, path: str) -> str:
"""Redirects URLs and converts paths to relative.
@ -256,6 +251,7 @@ if __name__ == '__main__':
)
content = evaluate_this(tpl_item.content, namespace)
# """
# do redirections (http/https only!)
if 'redirections' in config:
content = re.sub(r'(https?://)([-\w]+[-\w.]*)(/\S*)?', partial(
@ -268,6 +264,13 @@ if __name__ == '__main__':
content = re.sub(r'(=>[ \t]+)/(\S+)', partial(
abs2rel, path=path
), content)
# """
# redirections, path conversion
if path.endswith('.gmi'):
for line in content.strip().split('\n'):
if line.startswith('=> '):
...
# save results
save_this('../'+path, content)
@ -305,7 +308,9 @@ if __name__ == '__main__':
# convert to html
if path.endswith('.gmi'):
content = gemtext2html(content)
parser = GemtextParser(content)
soup = gemtext2html(parser)
content = soup.prettify()
if html_path.endswith('.html'):
# TODO: html template
@ -313,7 +318,6 @@ if __name__ == '__main__':
...
# redirections, path conversion
soup = BeautifulSoup(content, features='html.parser')
for attr in 'href', 'src', 'action':
for node in soup.css.select('['+attr+']'):

View File

@ -7,7 +7,7 @@ line_patterns = {
'head1': re.compile(r'#\s+(.*)'),
'head2': re.compile(r'##\s+(.*)'),
'head3': re.compile(r'###\s+(.*)'),
'quote': re.compile(r'>\s+(.*)')
'quote': re.compile(r'>\s*(.*)')
}
class GemtextParser: