Renewed gemtext2html function

2023-07-19 08:04:19 +02:00 · 2023-07-19 08:04:19 +02:00 · 172f017090
parent f3a48ec9e6
commit 172f017090
2 changed files with 65 additions and 61 deletions
--- a/_src/build.py
+++ b/_src/build.py
@ -14,8 +14,8 @@ import traceback
 from item import Item # from itself
 from files import save_this, link_this, scan_dir, delete_this # from itself
 from config import config # from itself
-import html
 from bs4 import BeautifulSoup # from beautifulsoup4
+from gemtext import GemtextParser # from itself

 def template_for(path: str) -> str:
    """Determine used template for the file
@ -111,70 +111,65 @@ def abs2rel(match: re.Match, path: str) -> str:
    print(' M', url)
    return match.group(1) + url

-def gemtext2html(gemtext: str) -> str:
+def gemtext2html(parser: GemtextParser, rotate_extension=True) -> BeautifulSoup:
    """Converts gemtext to html format"""
    
-    gemtext = gemtext.split('\n')
-    html_data = []
+    soup = BeautifulSoup()  # empty document; also used as the tag factory below
    
-    preformat = in_line = False
-    for line in gemtext:
-        if match := re.match(r'^```(.*)', line):
-            if preformat := not preformat:
-                if match.group(1):
-                    html_data.append(f'<pre language="{html.escape(match.group(1))}">')
-                else:
-                    html_data.append(f'<pre>')
+    for item in parser.elements:
+        if item.type == 'plain':
+            paragraphs = item.content.split('\n\n')  # a blank line separates paragraphs
+            el = BeautifulSoup()  # container holding one <p> per paragraph
+            for content in paragraphs:
+                p = soup.new_tag('p')
+                p.append(content)
+                el.append(p)
+        
+        elif item.type == 'link':
+            # put image if it's link to image file
+            if os.path.splitext(item.href)[1] in ('.jpg', '.png', '.gif'):
+                el = soup.new_tag('img', src=item.href)
+                if item.label:
+                    el.attrs['alt'] = item.label
            else:
-                html_data.append('</pre>')
-            continue
+                href = item.href  # local copy; may be rewritten to .html below
+                if rotate_extension and href.endswith('.gmi') and not re.match(r'^[\w]+:', href):
+                    href = os.path.splitext(href)[0] + '.html'
+                el = soup.new_tag('a', href=href)
+                if item.label:
+                    el.append(item.label)
        
-        if preformat:
-            html_data.append(html.escape(line))
-            continue
+        elif item.type == 'preformatted':
+            el = soup.new_tag('pre')
+            el.append(item.content)
+            if getattr(item, 'alt', None):  # el.alt would look up an <alt> child tag; presumably the item carries the alt text - confirm
+                el.attrs['title'] = item.alt
        
-        if not line: continue # skip empty lines
+        elif item.type == 'list':
+            el = soup.new_tag('ul')
+            for list_item in item.items:
+                li = soup.new_tag('li')
+                li.append(list_item)
+                el.append(li)
        
-        if match := re.match(r'^\*\s+(.*)', line):
-            if not in_line:
-                in_line = True
-                html_data.append('<ul>')
-            html_data.append(f'<li>{html.escape(match.group(1))}</li>')
-        else:
-            if in_line:
-                in_line = False
-                html_data.append('</ul>')
-            if match := re.match(r'^#\s+(.*)', line):
-                html_data.append(f'<h1>{html.escape(match.group(1))}</h1>')
-            elif match := re.match(r'^##\s+(.*)', line):
-                html_data.append(f'<h2>{html.escape(match.group(1))}</h2>')
-            elif match := re.match(r'^###\s+(.*)', line):
-                html_data.append(f'<h3>{html.escape(match.group(1))}</h3>')
-            elif match := re.match(r'^=>\s+(\S+)(\s+.*)?', line):
-                url = match.group(1)
-                text = match.group(2).strip() if match.group(2) else None
-                # rotate file extension
-                if url.endswith('.gmi') and not re.match(r'^[-\w]+:', url):
-                    url = os.path.splitext(url)[0] + '.html'
-                # if links points to image, display it instead
-                if os.path.splitext(url)[1] in ('.jpg', '.png', '.gif'):
-                    if text:
-                        html_data.append(f'<img src="{html.escape(url)}" alt="{html.escape(text)}">')
-                    else:
-                        html_data.append(f'<img src="{html.escape(url)}">')
-                else:
-                    html_data.append(f'<a href="{html.escape(url)}">{html.escape(text or url)}</a>')
-            elif match := re.match(r'^>\s*(.*)', line):
-                html_data.append(f'<blockquote>{html.escape(match.group(1))}</blockquote>')
-            else:
-                html_data.append(f'<p>{html.escape(line)}</p>')
-    # close tags
-    if preformat:
-        html_data.append('</code>')
-    elif in_line:
-        html_data.append('</ul>')
-    
-    return '\n'.join(html_data)
+        elif item.type == 'head1':
+            el = soup.new_tag('h1')
+            el.append(item.content)
+        
+        elif item.type == 'head2':
+            el = soup.new_tag('h2')
+            el.append(item.content)
+        
+        elif item.type == 'head3':
+            el = soup.new_tag('h3')
+            el.append(item.content)
+        
+        elif item.type == 'quote':
+            el = soup.new_tag('blockquote')
+            el.append(item.content)  # assumes quote items expose .content like headings - confirm; FIXME: \n to <br>
+        
+        soup.append(el)  # NOTE(review): el is unbound/stale if item.type matched no branch above
+    return soup

 def convert_href(href: str, path: str) -> str:
    """Redirects URLs and converts paths to relative.
@ -256,6 +251,7 @@ if __name__ == '__main__':
                    )
                    content = evaluate_this(tpl_item.content, namespace)
            
+#            """
            # do redirections (http/https only!)
            if 'redirections' in config:
                content = re.sub(r'(https?://)([-\w]+[-\w.]*)(/\S*)?', partial(
@ -268,6 +264,13 @@ if __name__ == '__main__':
                content = re.sub(r'(=>[ \t]+)/(\S+)', partial(
                    abs2rel, path=path
                ), content)
+#            """
+            
+            # redirections, path conversion
+            if path.endswith('.gmi'):
+                for line in content.strip().split('\n'):
+                    if line.startswith('=> '):
+                        ...
            
            # save results
            save_this('../'+path, content)
@ -305,7 +308,9 @@ if __name__ == '__main__':
            
            # convert to html
            if path.endswith('.gmi'):
-                content = gemtext2html(content)
+                parser = GemtextParser(content)
+                soup = gemtext2html(parser)
+                content = soup.prettify()
            
            if html_path.endswith('.html'):            
                # TODO: html template
@ -313,7 +318,6 @@ if __name__ == '__main__':
                    ...
                
                # redirections, path conversion
-                
                soup = BeautifulSoup(content, features='html.parser')
                for attr in 'href', 'src', 'action':
                    for node in soup.css.select('['+attr+']'):
--- a/_src/gemtext.py
+++ b/_src/gemtext.py
@ -7,7 +7,7 @@ line_patterns = {
    'head1': re.compile(r'#\s+(.*)'),
    'head2': re.compile(r'##\s+(.*)'),
    'head3': re.compile(r'###\s+(.*)'),
-    'quote': re.compile(r'>\s+(.*)')
+    'quote': re.compile(r'>\s*(.*)')
 }

 class GemtextParser: