Applicative text parser

TODO: markdown & gemini coming
This commit is contained in:
orbifx 2021-02-25 23:22:35 +00:00
commit c15e8e2a0d
26 changed files with 557 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
_build
*.txt
*.merlin

23
bin/cli.ml Normal file
View File

@ -0,0 +1,23 @@
module Test = struct
type t = unit
let blank_line () = print_string "{bl}"
let angled_uri s () = print_string ("{>}" ^ s ^ "{<}")
let plain_text s () = print_string s
let heading_hashbang i s () = print_string (string_of_int i ^ s)
let paragraph_s () = print_string "{p>}"
let paragraph_e () = print_string "{<p}"
let key_value a b () = print_endline (a ^"~"^ String.trim b)
end
let () =
let string_of_file filename =
let ch = open_in filename in
let s = really_input_string ch (in_channel_length ch) in
close_in ch;
s in
let filename = Sys.argv.(1) in
(* let module Parse = Text.MakeSimple (Html) in *)
let module Parse = Parsers.Plain_text.Make (Test) in
(*let subsyntaxes = [| (module Parser.Key_value.Make (Test) : Text.Parser with type t = Test.t) |] in*)
(*let of_string text acc = Text.parse subsyntaxes { text; pos = 0; right_boundary = String.length text - 1 } acc in*)
Parse.of_string (string_of_file filename) ()

4
bin/dune Normal file
View File

@ -0,0 +1,4 @@
(executable
(name cli)
(modules cli)
(libraries text parsers))

3
converters/dune Normal file
View File

@ -0,0 +1,3 @@
(library
(name converter)
(public_name text_parse.converter))

28
converters/gemini.ml Normal file
View File

@ -0,0 +1,28 @@
type t = string
let blank_line a = a ^ "\n"
let plain_text s a = a ^ s
let sentence_s a = a ^ ""
let sentence_e a = a ^ " "
let sentence_segment s a = a ^ " " ^ s
let reference_name n a = a ^ "[" ^ n ^ "]"
let bracketed_referent_s n a = a ^ "[" ^ n ^ "]: "
let bracketed_referent_e a = a ^ "\n"
let angled_uri u a = a ^ "\n=> " ^ u
let bold t a = a ^ "*" ^ t ^ "*"
let italic t a = a ^ "/" ^ t ^ "/"
let underline t a = a ^ "_" ^ t ^ "_"
let inline_monospace t a = a ^ "`" ^ t ^ "`"
let heading_hashbang lvl h a = a ^ String.make lvl '#' ^ h ^ "\n"
let paragraph_s a = a
let paragraph_e a = a
let preformatted s a = a ^ "<pre>" ^ s ^ "</pre>"
let bullet_list_s a = a
let bullet_list_e a = a
let bullet_item_s ch a = a ^ Char.escaped ch
let bullet_item_e a = a ^ "\n"
let ordered_list_s a = a
let ordered_list_e a = a
let ordered_item_s = bullet_item_s
let ordered_item_e = bullet_item_e
let key_value_pair k v a = prerr_endline @@ k ^ "~" ^ v; a

39
converters/html.ml Normal file
View File

@ -0,0 +1,39 @@
let esc x =
let fn a c = match c with
| '&' -> a ^ "&amp;"
| '<' -> a ^ "&lt;"
| '"' -> a ^ "&quot;"
| '\''-> a ^ "&apos;"
| x -> a ^ String.make 1 x
in
Seq.fold_left fn "" (String.to_seq x)
type t = string
let blank_line a = a ^ ""
let plain_text s a = a ^ esc s
let sentence_s a = a ^ ""
let sentence_e a = a ^ " "
let sentence_segment s a = a ^ esc s ^ " "
let reference_name n a = a ^ {|<a href="#|} ^ n ^ {|">|} ^ esc n ^ "</a>"
let bracketed_referent_s n a = a ^ {|<a id="|} ^ n ^ {|">|} ^ esc n ^ "</a>: "
let bracketed_referent_e a = a ^ "<br/>"
let angled_uri u a = a ^ {|&lt;<a href="|} ^ u ^ {|">|} ^ esc u ^ {|</a>&gt;|}
let bold t a = a ^ "<b>" ^ esc t ^ "</b>"
let italic t a = a ^ "<i>" ^ esc t ^ "</i>"
let underline t a = a ^ "<u>" ^ esc t ^ "</u>"
let inline_monospace t a = a ^ "<code>" ^ esc t ^ "</code>"
let heading_hashbang lvl h a =
let lvl = string_of_int lvl in
a ^ "<h" ^ lvl ^ " id=\"" ^ esc (String.lowercase_ascii h) ^"\">" ^ esc h ^ "</h" ^ lvl ^ ">"
let paragraph_s a = a ^ "<p>"
let paragraph_e a = a ^ "</p>"
let preformatted s a = a ^ "<pre>" ^ esc s ^ "</pre>"
let bullet_list_s a = a ^ "<ul>"
let bullet_list_e a = a ^ "</ul>"
let bullet_item_s _ch a = a ^ "<li>"
let bullet_item_e a = a ^ "</li>"
let ordered_list_s a = a ^ "<ol>"
let ordered_list_e a = a ^ "</ol>"
let ordered_item_s = bullet_item_s
let ordered_item_e = bullet_item_e
let key_value k v a = prerr_endline @@ k ^ "~" ^ v; a

22
cursor.ml Normal file
View File

@ -0,0 +1,22 @@
type t = { text : string; pos : int; right_boundary : int }
let overran cursor = cursor.pos >= cursor.right_boundary
let next_char cursor = { cursor with pos = cursor.pos + 1 }
let char_at cur offset = String.get cur.text (cur.pos + offset)
let char cur = String.get cur.text cur.pos
let distance a b = b.pos - a.pos
let sub ?left ?right cur = { cur with
pos = Option.value left ~default:cur.pos;
right_boundary = Option.value right ~default:cur.right_boundary }
let unwrap num cur = sub ~left:(cur.pos+num) ~right:(cur.right_boundary-num) cur
let segment_string cur = String.sub cur.text cur.pos (cur.right_boundary - cur.pos)
(*todo: reconsider +1 result and type cursor*)
let rec find_end e = function
| cur when cur.pos + 1 = String.length cur.text -> Some cur.pos
| cur when overran cur -> None
| cur when e cur (char cur) -> Some (cur.pos + 1)
| cur -> find_end e (next_char cur)

4
dune Normal file
View File

@ -0,0 +1,4 @@
(library
(name text_parse)
(public_name text_parse)
(modules parser syntax cursor))

16
dune-project Normal file
View File

@ -0,0 +1,16 @@
(lang dune 2.4)
(name text_parse)
(version 1.02)
(homepage "https://cgit.orbitalfox.eu/text-parse-ml")
(source (uri git://orbitalfox.eu/text-parse-ml))
(license EUPL)
(authors "orbifx")
(maintainers "fox@orbitalfox.eu")
(bug_reports "mailto:fox@orbitalfox.eu?subject=Text parse ML Issue:")
(generate_opam_files true)
(package
(name text_parse)
(synopsis "Applicative text parsing"))

37
parser.ml Normal file
View File

@ -0,0 +1,37 @@
module type S = sig
include Syntax.S
type t
val parse: Cursor.t -> t -> t
end
module type Sub_parsers = sig
type t
val subparsers: (module S with type t = t) array
end
let at s e cur ch = if s cur ch then Cursor.find_end e cur else None
let apply_default (type a) (module P: S with type t = a) (acc: a) cursor_default cursor =
if cursor_default = cursor then acc
else P.parse (Cursor.sub ~right:(cursor.Cursor.pos) cursor_default) acc
let rec branch: type a. ?idx:int -> a -> Cursor.t -> Cursor.t -> (module S with type t = a) array -> (a * Cursor.t) =
fun ?idx:(i=1) acc cursor_default cursor syntaxes ->
if Cursor.overran cursor then (apply_default syntaxes.(0) acc cursor_default cursor), cursor
else
try let (module P: S with type t = a) = syntaxes.(i) in
(match at P.s P.e cursor (Cursor.char cursor) with
| Some right ->
let acc = apply_default syntaxes.(0) acc cursor_default cursor in
let acc = P.parse (Cursor.sub ~right cursor) acc in
let cursor = Cursor.sub ~left:right cursor in
branch acc cursor cursor syntaxes
| None | exception Invalid_argument _ -> branch ~idx:(i+1) acc cursor_default cursor syntaxes)
with Invalid_argument _ ->
branch acc cursor_default (Cursor.next_char cursor) syntaxes
let rec parse subsyntaxes cursor acc =
if Cursor.overran cursor then acc
else
let acc, cursor = branch acc cursor cursor subsyntaxes in
parse subsyntaxes cursor acc

11
parsers/blank_line.ml Normal file
View File

@ -0,0 +1,11 @@
module type Fn = sig
type t
val blank_line: t -> t
end
module Make (F : Fn) = struct
type t = F.t
let s _cur = function '\n' -> true | _ -> false
let e _cur = function '\n' -> true | _ -> false
let parse _cursor acc = F.blank_line acc
end

31
parsers/bullet.ml Normal file
View File

@ -0,0 +1,31 @@
module type Fn = sig
type t
val bullet_item_s: char -> 'a -> 'a
val bullet_item_e: 'a -> 'a
val bullet_list_s: 'a -> 'a
val bullet_list_e: 'a -> 'a
end
open Text_parse.Parser
open Text_parse.Syntax
open Text_parse.Cursor
module Item (F : Fn) = struct
type t = F.t
let s _cursor = function '-' | '+' | '*' -> true | _ -> false
let e cursor _ch = newline (char_at cursor 1)
let subsyntaxes = [||]
let parse cur acc =
let bullet_char = char cur in
let left = match find_end (fun _cur c -> c <> ' ') { cur with pos = cur.pos + 1 }
with Some x -> x-1 | None -> 0 in
F.bullet_item_s bullet_char acc |> parse subsyntaxes (sub ~left cur) |> F.bullet_item_e
end
module List (F : Fn) = struct
type t = F.t
let s _cursor = function '-' | '+' | '*' -> true | _ -> false
let e cursor _ch = newline (char_at cursor 1) && not (s cursor (char_at cursor 2))
let subsyntaxes = [| (module Item (F) : Text_parse.Parser.S with type t = F.t) |]
let parse cur acc = F.bullet_list_s acc |> parse subsyntaxes cur |> F.bullet_list_e
end

4
parsers/dune Normal file
View File

@ -0,0 +1,4 @@
(library
(name parsers)
(public_name text_parse.parsers)
(libraries text_parse))

32
parsers/emphasis.ml Normal file
View File

@ -0,0 +1,32 @@
module type Fn = sig
val bold: string -> 'a -> 'a
val italic: string -> 'a -> 'a
val underline: string -> 'a -> 'a
val inline_monospace: string -> 'a -> 'a
end
open Text_parse.Cursor
module Bold (F : Fn) = struct
let s _cursor = function '*' -> true | _ -> false
let e = s
let parse cur acc = F.bold (segment_string (unwrap 1 cur)) acc
end
module Italic (F : Fn) = struct
let s _cursor = function '/' -> true | _ -> false
let e = s
let parse cur acc = F.italic (segment_string (unwrap 1 cur)) acc
end
module Underline (F : Fn) = struct
let s _cursor = function '_' -> true | _ -> false
let e = s
let parse cur acc = F.underline (segment_string (unwrap 1 cur)) acc
end
module Inline_monospace (F : Fn) = struct
let s _cursor = function '`' -> true | _ -> false
let e = s
let parse cur acc = F.inline_monospace (segment_string (unwrap 1 cur)) acc
end

17
parsers/heading.ml Normal file
View File

@ -0,0 +1,17 @@
module type Fn = sig
type t
val heading_hashbang: int -> string -> t -> t
end
open Text_parse.Syntax
open Text_parse.Cursor
module Hashbang (F : Fn) = struct
type t = F.t
let s _cur = function '#' -> true | _ -> false
let e _cur = newline
let parse cursor acc =
let level = match find_end (fun _cur c -> c <> '#') cursor with
Some x -> x - cursor.pos - 1 | None -> 0 in
F.heading_hashbang level (segment_string { cursor with pos = cursor.pos + level + 1; right_boundary = cursor.right_boundary-1 }) acc
end

19
parsers/key_value.ml Normal file
View File

@ -0,0 +1,19 @@
module type Fn = sig
type t
val key_value: string -> string -> t -> t
end
open Text_parse.Syntax
open Text_parse.Cursor
module Make (F : Fn) = struct
type t = F.t
let s _cur c = letter c
let e _cur c = newline c
let parse cursor acc =
let colon_pos = match find_end (fun _cur c -> c = ':') cursor with
Some x -> x - cursor.pos - 1 | None -> 0 in (*todo:None shouldn't be allowed by scope*)
let key = segment_string { cursor with right_boundary = cursor.pos+colon_pos } in
let value = segment_string { cursor with pos = cursor.pos+colon_pos+1; right_boundary = cursor.right_boundary } in
F.key_value key value acc
end

27
parsers/markdown.ml Normal file
View File

@ -0,0 +1,27 @@
module type Markdown_t = sig
include Blank_line.Fn
include Reference.Fn with type t := t
include Bullet.Fn with type t := t
include Ordered.Fn with type t := t
include Heading.Fn with type t := t
include Preformatted.Fn with type t := t
include Paragraph.Fn with type t := t
end
open Text_parse.Parser
open Text_parse.Cursor
module Make (F : Markdown_t) = struct
let subsyntaxes = [|
(module Blank_line.Make (F) : Text_parse.Parser.S with type t = F.t);
(module Heading.Hashbang (F));
(module Reference.Referent (F));
(module Bullet.List (F));
(module Ordered.List (F));
(module Preformatted.Tabbed (F));
(*(module Paragraph.Make (F));*)
|]
let of_string text acc =
parse subsyntaxes { text; pos = 0; right_boundary = String.length text - 1 } acc
end

37
parsers/ordered.ml Normal file
View File

@ -0,0 +1,37 @@
module type Fn = sig
type t
val ordered_item_s: char -> 'a -> 'a
val ordered_item_e: 'a -> 'a
val ordered_list_s: 'a -> 'a
val ordered_list_e: 'a -> 'a
end
open Text_parse.Parser
open Text_parse.Syntax
open Text_parse.Cursor
module Item (F : Fn) = struct
type t = F.t
let s cur ch =
let is_enum c = (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') in
let is_delim c = c = '.' || c = ')' in
is_enum ch && is_delim (char_at cur 1)
let e cursor _ch = newline (char_at cursor 1)
let subsyntaxes = [||]
let parse cur acc =
let bullet_char = char cur in
let left = match find_end (fun _cur c -> c <> ' ') { cur with pos = cur.pos + 1 }
with Some x -> x | None -> 0 in
F.ordered_item_s bullet_char acc |> parse subsyntaxes (sub ~left cur) |> F.ordered_item_e
end
module List (F: Fn) = struct
type t = F.t
let s cur ch =
let is_enum c = (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') in
let is_delim c = c = '.' || c = ')' in (*todo: can't have `.` if sentence ends with it and lists are in sense*)
is_enum ch && is_delim (char_at cur 1)
let e cursor _ch = newline (char_at cursor 1) && newline (char_at cursor 2)(* not (s {cursor with pos = cursor.pos+2} (char_at cursor 2)) *)
let subsyntaxes = [| (module Item (F) : Text_parse.Parser.S with type t = F.t) |]
let parse cur acc = F.ordered_list_s acc |> parse subsyntaxes cur |> F.ordered_list_e
end

19
parsers/paragraph.ml Normal file
View File

@ -0,0 +1,19 @@
module type Fn = sig
type t
val paragraph_s: t -> t
val paragraph_e: t -> t
end
open Text_parse.Parser
open Text_parse.Syntax
open Text_parse.Cursor
module Make (F : Fn)(S : Text_parse.Parser.Sub_parsers with type t = F.t) = struct
type t = F.t
let s _cur ch = printable ch
let e cur = function
| '\n' -> char_at cur (-1) = '\n'
| _ when cur.pos + 1 = cur.right_boundary -> true
| _ -> false
let parse cur acc = F.paragraph_s acc |> parse S.subparsers cur |> F.paragraph_e
end

36
parsers/plain_text.ml Normal file
View File

@ -0,0 +1,36 @@
module type Fn = sig
type t
val plain_text: string -> t -> t
end
open Text_parse.Parser
open Text_parse.Cursor
module Plain_text (F : Fn) = struct
type t = F.t
let s _cursor _ch = true
let e cursor = function
| '\n' -> char_at cursor (-1) = '\n'
| _ when cursor.pos + 1 = cursor.right_boundary -> true
| _ -> false
let parse cur acc = F.plain_text (segment_string cur) acc
end
module type Plain_text_t = sig
include Blank_line.Fn
include Heading.Fn with type t := t
include Uri.Fn with type t := t
include Paragraph.Fn with type t := t
include Fn with type t := t
end
module Make (F : Plain_text_t) = struct
module P = struct
type t = F.t
let subparsers = [| (module Plain_text (F) : Text_parse.Parser.S with type t = F.t); (module Uri.Angled (F)) |]
end
let subparsers = [| (module Paragraph.Make (F)(P) : Text_parse.Parser.S with type t = F.t); (module Blank_line.Make (F)); (module Heading.Hashbang (F)); (module Paragraph.Make (F)(P)); |]
let of_string text acc = parse subparsers { text; pos = 0; right_boundary = String.length text - 1 } acc
end

13
parsers/preformatted.ml Normal file
View File

@ -0,0 +1,13 @@
module type Fn = sig
type t
val tab_preformatted: string -> t -> t
end
open Text_parse.Cursor
module Tabbed (F : Fn) = struct
type t = F.t
let s _cur ch = '\t' = ch
let e cur = function '\n' -> not (char_at cur 1 = '\t') | _ -> false
let parse cur acc = F.tab_preformatted (segment_string cur) acc
end

31
parsers/reference.ml Normal file
View File

@ -0,0 +1,31 @@
module type Fn = sig
type t
val reference_name: string -> string -> t -> t
val referent_s: string -> t -> t
val referent_e: t -> t
end
open Text_parse.Parser
open Text_parse.Cursor
open Text_parse.Syntax
module Name (F : Fn) = struct
type t = F.t
let s _cursor = function '[' -> true | _ -> false
let e _cursor = function ']' -> true | _ -> false
let parse cur acc = F.reference_name (segment_string (unwrap 1 cur)) acc
end
module Referent (F : Fn) = struct
type t = F.t
let find_name_end = find_end (fun cur c -> c = ']' && (char_at cur 1) = ':')
let s cur = function '[' -> Option.is_some (find_name_end cur) | _ -> false
let e _cur = newline
let subsyntaxes = [| |]
let parse cur acc =
let name_boundary = match find_name_end cur with Some x -> x | None -> 0 in
let name = segment_string { cur with pos = cur.pos+1; right_boundary = name_boundary-1 } in
let text_cur = { cur with pos = name_boundary+2 } in
F.referent_s name acc |> parse subsyntaxes text_cur |> F.referent_e
end

37
parsers/sentence.ml Normal file
View File

@ -0,0 +1,37 @@
module type Fn = sig
val sentence_segment: string -> 'a -> 'a
val sentence_s: 'a -> 'a
val sentence_e: 'a -> 'a
end
open Text_parse.Parser
open Text_parse.Syntax
open Text_parse.Cursor
module Sentence_segment (F : Fn) = struct
let s _cursor = printable
let e cursor = function
| '.' -> char_at cursor 1 = ' ' || newline (char_at cursor 1) (* todo punctuations *)
| '\n' -> char_at cursor 1 = '\n'
| _ when cursor.pos + 1 = cursor.right_boundary -> true
| _ when char_at cursor 1 = '[' -> true
| _ when char_at cursor 1 = '*' -> true
| _ when char_at cursor 1 = '_' -> true
| _ when char_at cursor 1 = '/' -> true
| _ when char_at cursor 1 = '`' -> true
| _ when char_at cursor 1 = '<' -> true
| _ -> false
let at = at s e
let parse cur acc = F.sentence_segment (segment_string cur) acc
end
module Sentence (F : Fn) = struct
let s _cursor = printable
let e cursor = function
| '.' -> char_at cursor 1 = ' ' (* todo punctuations *)
| '\n' -> char_at cursor 1 = '\n'
| _ -> false
let at = at s e
let subsyntaxes = [| |]
let parse cur acc = F.sentence_s acc |> parse subsyntaxes cur |> F.sentence_e
end

27
parsers/uri.ml Normal file
View File

@ -0,0 +1,27 @@
module type Fn = sig
type t
val angled_uri: string -> t -> t
end
open Text_parse.Syntax
open Text_parse.Cursor
module Angled (F : Fn) = struct
type t = F.t
let s cur = function '<' -> letter (char_at cur 1) | _ -> false
let e _cur = function '>' -> true | _ -> false
let parse cur acc = F.angled_uri (segment_string (unwrap 1 cur)) acc
end
(* module Uri (F : TextFn) = struct
* type t = F.t
* let rec is_scheme cur = function
* | ':' -> true
* | ch when letter ch -> is_scheme (next_char cur) (char_at cur 1)
* | _ -> false
* let s cur ch = letter ch && is_scheme (next_char cur) (char_at cur 1)
* let e cur _ch = match char_at cur 1 with '\n' | ' ' -> true | _ -> false
* let at = at s e
* let parse cur acc = F.angled_uri (segment_string cur) acc
* end *)

10
syntax.ml Normal file
View File

@ -0,0 +1,10 @@
module type S = sig
val s: Cursor.t -> char -> bool
val e: Cursor.t -> char -> bool
end
(*let str c = String.make 1 c*)
let newline = function '\n' -> true | _ -> false
let printable ch = ch >= ' ' && ch <= '~'
let letter ch = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')

27
text_parse.opam Normal file
View File

@ -0,0 +1,27 @@
# This file is generated by dune, edit dune-project instead
opam-version: "2.0"
version: "1.02"
synopsis: "Applicative text parsing"
maintainer: ["fox@orbitalfox.eu"]
authors: ["orbifx"]
license: "EUPL"
homepage: "https://cgit.orbitalfox.eu/text-parse-ml"
bug-reports: "mailto:fox@orbitalfox.eu?subject=Text parse ML Issue:"
depends: [
"dune" {>= "2.4"}
]
build: [
["dune" "subst"] {pinned}
[
"dune"
"build"
"-p"
name
"-j"
jobs
"@install"
"@runtest" {with-test}
"@doc" {with-doc}
]
]
dev-repo: "git://orbitalfox.eu/text-parse-ml"