Compare commits

...

3 Commits

Author SHA1 Message Date
faildev_mode 172f017090
Renewed gemtext2html function 2023-07-19 08:04:19 +02:00
faildev_mode f3a48ec9e6
Bug fix: some properties were per-class, not per-object (gemtext.GemtextParser, item.Item), reassemble gemtext document (gemtext) 2023-07-18 22:16:58 +02:00
faildev_mode 3188379c9e
Added gemtext parser 2023-07-18 18:58:35 +02:00
3 changed files with 179 additions and 66 deletions

View File

@ -14,8 +14,8 @@ import traceback
from item import Item # from itself
from files import save_this, link_this, scan_dir, delete_this # from itself
from config import config # from itself
import html
from bs4 import BeautifulSoup # from beautifulsoup4
from gemtext import GemtextParser # from itself
def template_for(path: str) -> str:
"""Determine used template for the file
@ -111,70 +111,65 @@ def abs2rel(match: re.Match, path: str) -> str:
print(' M', url)
return match.group(1) + url
def gemtext2html(gemtext: str) -> str:
def gemtext2html(parser: GemtextParser, rotate_extension=True) -> BeautifulSoup:
"""Converts gemtext to html format"""
gemtext = gemtext.split('\n')
html_data = []
soup = BeautifulSoup()
preformat = in_line = False
for line in gemtext:
if match := re.match(r'^```(.*)', line):
if preformat := not preformat:
if match.group(1):
html_data.append(f'<pre language="{html.escape(match.group(1))}">')
else:
html_data.append(f'<pre>')
for item in parser.elements:
if item.type == 'plain':
paragraphs = item.content.split('\n\n')
el = BeautifulSoup()
for content in paragraphs:
p = soup.new_tag('p')
p.append(content)
el.append(p)
if item.type == 'link':
# put image if it's link to image file
if os.path.splitext(item.href)[1] in ('.jpg', '.png', '.gif'):
el = soup.new_tag('img', src=item.href)
if item.label:
el.attrs['alt'] = item.label
else:
html_data.append('</pre>')
continue
href = item.href
if rotate_extension and href.endswith('.gmi') and not re.match(r'^[\w]+:', href):
href = os.path.splitext(href)[0] + '.html'
el = soup.new_tag('a', href=href)
if item.label:
el.append(item.label)
if preformat:
html_data.append(html.escape(line))
continue
elif item.type == 'preformatted':
el = soup.new_tag('pre')
el.append(item.content)
if el.alt:
el.attrs['title'] = el.alt
if not line: continue # skip empty lines
elif item.type == 'list':
el = soup.new_tag('ul')
for list_item in item.items:
li = soup.new_tag('li')
li.append(list_item)
el.append(li)
if match := re.match(r'^\*\s+(.*)', line):
if not in_line:
in_line = True
html_data.append('<ul>')
html_data.append(f'<li>{html.escape(match.group(1))}</li>')
else:
if in_line:
in_line = False
html_data.append('</ul>')
if match := re.match(r'^#\s+(.*)', line):
html_data.append(f'<h1>{html.escape(match.group(1))}</h1>')
elif match := re.match(r'^##\s+(.*)', line):
html_data.append(f'<h2>{html.escape(match.group(1))}</h2>')
elif match := re.match(r'^###\s+(.*)', line):
html_data.append(f'<h3>{html.escape(match.group(1))}</h3>')
elif match := re.match(r'^=>\s+(\S+)(\s+.*)?', line):
url = match.group(1)
text = match.group(2).strip() if match.group(2) else None
# rotate file extension
if url.endswith('.gmi') and not re.match(r'^[-\w]+:', url):
url = os.path.splitext(url)[0] + '.html'
# if links points to image, display it instead
if os.path.splitext(url)[1] in ('.jpg', '.png', '.gif'):
if text:
html_data.append(f'<img src="{html.escape(url)}" alt="{html.escape(text)}">')
else:
html_data.append(f'<img src="{html.escape(url)}">')
else:
html_data.append(f'<a href="{html.escape(url)}">{html.escape(text or url)}</a>')
elif match := re.match(r'^>\s*(.*)', line):
html_data.append(f'<blockquote>{html.escape(match.group(1))}</blockquote>')
else:
html_data.append(f'<p>{html.escape(line)}</p>')
# close tags
if preformat:
html_data.append('</code>')
elif in_line:
html_data.append('</ul>')
return '\n'.join(html_data)
elif item.type == 'head1':
el = soup.new_tag('h1')
el.append(item.content)
elif item.type == 'head2':
el = soup.new_tag('h2')
el.append(item.content)
elif item.type == 'head3':
el = soup.new_tag('h3')
el.append(item.content)
elif item.type == 'quote':
el = soup.new_tag('blockquote')
# FIXME: \n to <br>
soup.append(el)
return soup
def convert_href(href: str, path: str) -> str:
"""Redirects URLs and converts paths to relative.
@ -256,6 +251,7 @@ if __name__ == '__main__':
)
content = evaluate_this(tpl_item.content, namespace)
# """
# do redirections (http/https only!)
if 'redirections' in config:
content = re.sub(r'(https?://)([-\w]+[-\w.]*)(/\S*)?', partial(
@ -268,6 +264,13 @@ if __name__ == '__main__':
content = re.sub(r'(=>[ \t]+)/(\S+)', partial(
abs2rel, path=path
), content)
# """
# redirections, path conversion
if path.endswith('.gmi'):
for line in content.strip().split('\n'):
if line.startswith('=> '):
...
# save results
save_this('../'+path, content)
@ -305,7 +308,9 @@ if __name__ == '__main__':
# convert to html
if path.endswith('.gmi'):
content = gemtext2html(content)
parser = GemtextParser(content)
soup = gemtext2html(parser)
content = soup.prettify()
if html_path.endswith('.html'):
# TODO: html template
@ -313,7 +318,6 @@ if __name__ == '__main__':
...
# redirections, path conversion
soup = BeautifulSoup(content, features='html.parser')
for attr in 'href', 'src', 'action':
for node in soup.css.select('['+attr+']'):

108
_src/gemtext.py Normal file
View File

@ -0,0 +1,108 @@
import re
# Compiled matchers for every non-plain gemtext line type, keyed by the name
# that ends up in GemtextElement.type. Patterns are tried in insertion order
# and the first one that matches decides the type; a line that matches none
# of them is 'plain'.
line_patterns = {
    'link': re.compile(r'=>(\s+)(\S+)(\s+(.*))?'),
    'preformatted': re.compile(r'```(.*)'),
    'list': re.compile(r'\*\s+(.*)'),
    'head1': re.compile(r'#\s+(.*)'),
    'head2': re.compile(r'##\s+(.*)'),
    'head3': re.compile(r'###\s+(.*)'),
    'quote': re.compile(r'>\s*(.*)'),
}


class GemtextParser:
    """Provides abstract representation of gemtext file, that can be used in
    conversion to html or modification in documents without regexes.

    After construction, ``elements`` holds the document as an ordered list of
    GemtextElement objects. Consecutive plain lines, consecutive quote lines
    and consecutive list items are merged into a single element each; every
    other line type gets its own element. ``str(parser)`` reassembles the
    gemtext document.
    """

    def __init__(self, content: str):
        """Parse ``content`` (a whole gemtext document) into ``self.elements``."""
        self.elements = []
        for line in content.split('\n'):
            last_item = self.elements[-1] if self.elements else None

            # Inside an open preformatted block nothing is interpreted: every
            # line is body text until the next toggle line.
            if (last_item is not None
                    and last_item.type == 'preformatted'
                    and not last_item.closed):
                # Any line starting with ``` toggles preformatted mode off;
                # text after the fence on a closing line is ignored.
                if line.startswith('```'):
                    last_item.closed = True
                else:
                    last_item.add_line(line)
                continue

            new = GemtextElement(line)
            same_type = last_item is not None and last_item.type == new.type
            if same_type and new.type in ('plain', 'quote'):
                last_item.add_line(new.content)
            elif same_type and new.type == 'list':
                last_item.items.extend(new.items)
            else:
                # Links, headings and a fresh preformatted block never merge
                # with their predecessor.
                self.elements.append(new)

    def __str__(self):
        """Reassemble the gemtext document from the parsed elements."""
        return '\n'.join(str(x) for x in self.elements)


class GemtextElement:
    """Represents single Gemtext element. It is created from single line and
    extended later.

    Attributes set depend on ``type``:
      * 'plain', 'quote', 'head1', 'head2', 'head3': ``content`` (str)
      * 'link': ``href`` (str) and ``label`` (str or None)
      * 'list': ``items`` (list of str)
      * 'preformatted': ``alt`` (str or None), ``content`` (str, the body
        lines joined with newlines) and ``closed`` (bool)
    """
    type = 'plain'
    content = None
    # True only for a preformatted block that has not received a body line
    # yet. An empty ``content`` string cannot express this, because a block
    # whose single body line is empty also has ``content == ''``.
    _awaiting_first_line = False

    def __init__(self, line: str):
        """Classify ``line`` and extract the fields for its type."""
        for kind, pattern in line_patterns.items():
            if match := pattern.match(line):
                self.type = kind
                if kind == 'link':
                    self.href = match.group(2)
                    self.label = match.group(4)
                elif kind == 'list':
                    self.items = [match.group(1)]
                elif kind == 'preformatted':
                    self.alt = match.group(1) or None
                    self.content = ''
                    self._awaiting_first_line = True
                    # to distinguish two consecutive preformatted blocks:
                    self.closed = False
                else:
                    self.content = match.group(1)
                break  # there is no point of testing further
        else:
            # no pattern matched: ordinary text line
            self.content = line

    def add_line(self, line: str):
        """Append one more line to ``content``.

        Does nothing for elements that carry no ``content`` (links, lists).
        """
        if self.content is None:
            return
        if self._awaiting_first_line:
            # first body line of a preformatted block: no separator needed
            self.content = line
            self._awaiting_first_line = False
        else:
            self.content += '\n' + line

    def __str__(self):
        """Gemtext reassembler"""
        if self.type == 'link':
            # a link without a label must not render the literal "None"
            if self.label:
                return f'=> {self.href} {self.label}'
            return f'=> {self.href}'
        if self.type == 'preformatted':
            opening = '```' + (self.alt or '')
            if self._awaiting_first_line:
                # empty block: fences only, no blank line in between
                return opening + '\n```'
            return '\n'.join((opening, self.content, '```'))
        if self.type == 'list':
            return '\n'.join('* ' + x for x in self.items)
        if self.type == 'head1':
            return '# ' + self.content
        if self.type == 'head2':
            return '## ' + self.content
        if self.type == 'head3':
            return '### ' + self.content
        if self.type == 'quote':
            return '\n'.join('> ' + x for x in self.content.split('\n'))
        # 'plain', and a safe fallback for any externally assigned type so
        # that str() never receives None
        return self.content or ''

View File

@ -8,18 +8,19 @@ class Item(AttrDict):
"""This class represents single content file
It extracts all frontmatter fields using python-frontmatter module"""
title = None
tags = []
description = None
source = None
author = None
def __init__(self, path: str, prefix: str = 'content/'):
# initialize parent
super().__init__()
self.path, self.prefix = path, prefix
# initialize common fields
self.title = None
self.tags = []
self.description = None
self.source = None
self.author = None
frontmatter_data, content = frontmatter.parse(read_this(prefix + path))
self.content = content
self.frontmatter_data = frontmatter_data