From a9df1cccaa5c31734a276ff97592fa6763ba3bd0 Mon Sep 17 00:00:00 2001 From: Vovanium Date: Fri, 15 Apr 2022 18:32:10 +0300 Subject: [PATCH] + character coverage analysis tool --- .gitignore | 4 +- Makefile | 10 ++ charsets/README.charsets | 9 ++ tools/check-coverage.awk | 236 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 charsets/README.charsets create mode 100755 tools/check-coverage.awk diff --git a/.gitignore b/.gitignore index 7664704..4a79983 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -*.bak \ No newline at end of file +*.bak +charsets/*.ucm +charsets/Blocks.txt \ No newline at end of file diff --git a/Makefile b/Makefile index a283aba..d818041 100644 --- a/Makefile +++ b/Makefile @@ -6,3 +6,13 @@ check-metrics-sans : check-metrics-sans-x check-metrics-sans-xb check-metrics-sa check-metrics-sans-x check-metrics-sans-xb check-metrics-sans-xi : check-metrics-sans-x% : awk -f tools/check-metrics.awk sans/salut-sans[0-9][0-9]$*.bdf + +.PHONY : check-coverage check-coverage-sans check-coverage-mono + +check-coverage : check-coverage-sans check-coverage-mono + +check-coverage-sans : + awk -f tools/check-coverage.awk sans/salut-sans*.bdf + +check-coverage-mono : + awk -f tools/check-coverage.awk mono/salut-mono*.bdf diff --git a/charsets/README.charsets b/charsets/README.charsets new file mode 100644 index 0000000..f611426 --- /dev/null +++ b/charsets/README.charsets @@ -0,0 +1,9 @@ +Place charset files here in UCM format. +They keep their own copyright, so I do not include them in the repository. +You can take them here: +https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm + +It is also good to place unicode block list provided by unicode.org here. +The link is https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt + +(Note that they may change the location; in that case, please kindly report the broken link.)
\ No newline at end of file
diff --git a/tools/check-coverage.awk b/tools/check-coverage.awk
new file mode 100755
index 0000000..82257a5
--- /dev/null
+++ b/tools/check-coverage.awk
@@ -0,0 +1,287 @@
+#!/usr/bin/gawk -f
+#
+# Character coverage report for a set of BDF fonts.
+#
+# Usage: gawk -f tools/check-coverage.awk font1.bdf font2.bdf ...
+#
+# Each FONT record in the input starts a new font.  The END rule prints:
+#   1. a legend assigning a letter (a, b, c, ...) to every font,
+#   2. a map of all encoded code points, 64 per row,
+#   3. per-font glyph counts for every range in charsets/Blocks.txt,
+#   4. per-font glyph counts for every charsets/*.ucm charset and for
+#      the console sets found in /usr/share/bdf2psf.
+#
+# Needs gawk: relies on match() with an array argument and on asorti().
+
+BEGIN {
+    hex_digits = "0123456789ABCDEF";
+    # assigned glyph counts in blocks (referenced by lowest codepoint)
+    ASSIGNED[hex("0000")] = 95; # ascii
+    ASSIGNED[hex("0080")] = 96; # latin1
+    ASSIGNED[hex("2000")] = 83; # general punct.
+}
+
+# Print a diagnostic on stderr and exit with status 1 if expression is false.
+function assert(expression, message) {
+    if (!expression) {
+        print "DEBUG ASSERTION FAILED "message > "/dev/stderr";
+        exit 1;
+    }
+}
+
+# Convert a string of hexadecimal digits (either case) to a number.
+# An empty string yields 0; any non-hex character trips assert().
+function hex(x,    i, y, d) {
+    y = 0;
+    for (i = 1; i <= length(x); i++) {
+        d = index(hex_digits, toupper(substr(x, i, 1)));
+        assert(d > 0 && d <= 16, "Hexadecimal character required");
+        y = y * 16 + d - 1;
+    }
+    return y;
+}
+
+# Format one table cell, always 4 columns wide, for a font that provides
+# n glyphs of a range or charset with "assigned" characters.
+function display_coverage_font(n, assigned) {
+    if (n == 0) {
+        return "   -";
+    }
+    if (n == assigned) {
+        return " ███";
+    }
+    if (n <= 999) {
+        return sprintf(" %3d", n);
+    }
+    # Larger counts do not fit the cell: show a percentage instead.
+    return sprintf("%3d%%", int(n * 100 / assigned));
+}
+
+# Print one table row for the code points first..last (numbers, inclusive).
+# "assigned" is the number of characters expected in the range; when it is
+# omitted it is taken from ASSIGNED[first], else from the size of the range.
+# Ranges in which no font has a glyph are not printed.
+function uc_range(name, first, last, assigned,    j, k, ks, t, output, count) {
+    if (!assigned && (first in ASSIGNED)) {
+        assigned = ASSIGNED[first];
+    }
+    if (!assigned) {
+        assigned = last - first + 1;
+    }
+    t = 0;
+    for (k = first; k <= last; k++) {
+        ks = sprintf("%8d", k);
+        # charname has a key for every code point present in any font
+        if (!(ks in charname)) {
+            continue;
+        }
+        for (j = 1; j <= fontno; j++) {
+            if ((ks, j) in exist) {
+                count[j]++;
+                t++;
+            }
+        }
+    }
+    if (t == 0) {
+        return;
+    }
+    output = "";
+    for (j = 1; j <= fontno; j++) {
+        output = output display_coverage_font(count[j] + 0, assigned);
+    }
+    printf("%04X..%04X: %s [%d] %s\n", first, last, output, assigned, name);
+}
+
+# Report coverage of every range listed in charsets/Blocks.txt.
+# If the file cannot be read no rows are printed.
+function unicode_blocks(    s, A, file) {
+    file = "charsets/Blocks.txt";
+    while ((getline s < file) > 0) {
+        if (match(s, "^([0-9A-F]+)\\.\\.([0-9A-F]+); (.*)$", A)) {
+            uc_range(A[3], hex(A[1]), hex(A[2]));
+        }
+    }
+    close(file);
+}
+
+# Print one table row for the character set described by filename.
+# Every line matching pattern contributes the code point captured by the
+# first group (hexadecimal); control characters are skipped.  After the
+# per-font cells the row shows the charset size and, when some font has
+# gaps, the lowest missing code point of the font with the fewest gaps.
+function character_set_common(charset, filename, pattern,    line, j, k, ks, n, code, assigned, missing, least_missing_n, least_missing_j, least_missing, A, T) {
+    printf("%-10.10s: ", charset);
+    assigned = 0;
+    while ((getline line < filename) > 0) {
+        if (!match(line, pattern, A)) {
+            continue;
+        }
+        code = hex(A[1]);
+        if (code < 32 || (code >= 127 && code < 160)) {
+            continue; # skip control characters
+        }
+        if (!(code in T)) {
+            T[code] = 1;
+            assigned++;
+        }
+    }
+    close(filename);
+    least_missing_j = 0;
+    for (j = 1; j <= fontno; j++) {
+        n = 0;
+        missing = -1;
+        for (k in T) {
+            ks = sprintf("%8d", k + 0);
+            if ((ks, j) in exist) {
+                n++;
+            } else if (missing < 0 || k + 0 < missing) {
+                # keep the lowest gap so the report does not depend on
+                # the array traversal order
+                missing = k + 0;
+            }
+        }
+        printf("%s", display_coverage_font(n, assigned));
+        if (assigned - n > 0 && (least_missing_j == 0 || assigned - n < least_missing_n)) {
+            least_missing_n = assigned - n;
+            least_missing = missing;
+            least_missing_j = j;
+        }
+    }
+    if (least_missing_j > 0) {
+        printf(" [%d] %s (%04X: %c @ %c)\n", assigned, charset, least_missing, least_missing, least_missing_j + 96);
+    } else {
+        # nothing is missing (or the set is empty): no example to show
+        printf(" [%d] %s\n", assigned, charset);
+    }
+}
+
+# Run character_set_common() on every file below dir whose name ends in
+# postfix; the file name without directory and postfix is the charset name.
+function character_sets_in_dir(dir, postfix, pattern,    list, s, charset) {
+    # sorted so that the report order is reproducible
+    list = "find '"dir"' -type f -name '*" postfix "' | sort";
+    while ((list | getline s) > 0) {
+        charset = s;
+        sub("^.*/", "", charset);
+        # cut the postfix literally; as a regexp its "." would match anything
+        charset = substr(charset, 1, length(charset) - length(postfix));
+        character_set_common(charset, s, pattern);
+    }
+    close(list);
+}
+
+# Report every charsets/*.ucm file.  Only mapping lines of the form
+# "<UXXXX> \xXX |0" are used; <UXXXX> is the code point.
+function all_character_sets() {
+    character_sets_in_dir("charsets", ".ucm", "^<U([0-9A-F]+)>\\s+\\\\x[0-9A-F]+\\s+\\|0");
+}
+
+# Report a single console set from /usr/share/bdf2psf (lines "U+XXXX").
+function console_set(charset) {
+    character_set_common(charset, "/usr/share/bdf2psf/"charset, "^U\\+([0-9A-F]+)$");
+}
+
+# Report all *.set, *.256 and *.512 files found below /usr/share/bdf2psf.
+function all_console_sets(    dir, pattern) {
+    dir = "/usr/share/bdf2psf";
+    pattern = "^U\\+([0-9A-Fa-f]+)($|\\s)";
+    character_sets_in_dir(dir, ".set", pattern);
+    character_sets_in_dir(dir, ".256", pattern);
+    character_sets_in_dir(dir, ".512", pattern);
+}
+
+#function ada_set(charset) {
+#    character_set_common(charset, "../charsets/"charset".hex", "16\\#([0-9A-F]+)\#");
+#}
+
+# --- BDF input ---------------------------------------------------------
+
+$1 == "FONT" {
+    fontno += 1
+    fontname[fontno] = $2
+    fontid[fontno] = FILENAME
+    name = ""
+    code = ""
+}
+
+$1 == "PIXEL_SIZE" {
+    pixelsize[fontno] = $2
+}
+
+$1 == "STARTCHAR" {
+    name = substr($0, 11) # text after "STARTCHAR "
+}
+
+# Code points are stored as "%8d" strings so that asorti() in END orders
+# them numerically.  Negative encodings are ignored: they are not code
+# points and would shift the first row of the map.
+$1 == "ENCODING" {
+    if ($2 + 0 >= 0) {
+        code = sprintf("%8d", $2 + 0)
+        charname[code] = name
+        exist[code, fontno] = 1
+    }
+}
+
+END {
+    # legend: one letter per font
+    for (j = 1; j <= fontno; j++) {
+        printf("%c. %s\n", j + 96, fontid[j]);
+    }
+
+    # map: cpr code points per row, one cell per code point that occurs
+    # in any font
+    n = asorti(charname, g)
+    row = -1
+    col = -1
+    cpr = 64
+    for (j = 1; j <= n; j++) {
+        i = g[j]
+        nr = int(i / cpr)
+        nc = i % cpr
+        ns = 0 # number of fonts that have this code point
+        for (k = 1; k <= fontno; k++) {
+            if ((i, k) in exist) {
+                ns += 1
+                nk = k # a font that has it
+            } else {
+                ak = k # a font that lacks it
+            }
+        }
+        if (ns == 1) {
+            # only one font has it: show that font's letter
+            och = sprintf("%c", nk + 96)
+        } else if (ns + 1 == fontno) {
+            # only one font lacks it: show that font's letter in reverse video
+            och = sprintf("\033[7m%c\033[0m", ak + 96)
+        } else {
+            # otherwise a bar whose width grows with the share of fonts
+            och = substr("▏▎▍▌▋▊▉█", int(ns * 7 / fontno + 1), 1)
+        }
+        if (nr != row) {
+            row = nr
+            printf("\n%06X ", row * cpr)
+            col = -1
+        }
+        if (nc - col > 1) {
+            printf("%*s", nc - col - 1, " ")
+        }
+        printf("%s", och)
+        col = nc
+    }
+    printf("\n")
+
+    # coverage table; the heading is padded to the 12-column row prefix
+    # and each font letter to the 4-column cell width
+    printf("%-12s", "   Range");
+    for (j = 1; j <= fontno; j++) {
+        printf("   %c", j + 96);
+    }
+    printf("\n");
+    unicode_blocks();
+    uc_range("Replacement character", 65533, 65533);
+    all_character_sets();
+    all_console_sets();
+    #ada_set("kyouiku_list");
+    #ada_set("jouyou_kanji");
+}