+ character coverage analysis tool

This commit is contained in:
Vovanium 2022-04-15 18:32:10 +03:00
parent 865a2b0662
commit a9df1cccaa
4 changed files with 258 additions and 1 deletions

4
.gitignore vendored
View File

@ -1 +1,3 @@
*.bak
*.bak
charsets/*.ucm
charsets/Blocks.txt

View File

@ -6,3 +6,13 @@ check-metrics-sans : check-metrics-sans-x check-metrics-sans-xb check-metrics-sa
check-metrics-sans-x check-metrics-sans-xb check-metrics-sans-xi : check-metrics-sans-x% :
awk -f tools/check-metrics.awk sans/salut-sans[0-9][0-9]$*.bdf
# Character coverage reports for the sans and mono font families.
# tools/check-coverage.awk uses gawk-only features (three-argument match(),
# asorti(), PROCINFO, "\s" in regular expressions), so it is run with gawk
# explicitly: a plain "awk" may resolve to mawk or busybox awk and fail.
.PHONY : check-coverage check-coverage-sans check-coverage-mono
check-coverage : check-coverage-sans check-coverage-mono
check-coverage-sans :
	gawk -f tools/check-coverage.awk sans/salut-sans*.bdf
check-coverage-mono :
	gawk -f tools/check-coverage.awk mono/salut-mono*.bdf

9
charsets/README.charsets Normal file
View File

@ -0,0 +1,9 @@
Place charset files here in UCM format.
They keep their own copyright, so I do not include them in the repository.
You can take them here:
https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm
It is also useful to place the Unicode block list provided by unicode.org here.
The link is https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt
(note that the location may change; if the link is broken, please kindly report it).

236
tools/check-coverage.awk Executable file
View File

@ -0,0 +1,236 @@
#!/usr/bin/gawk -f
# Script-wide lookup tables.
BEGIN {
	# Digit alphabet for hex(): a digit's value is its 1-based position
	# in this string minus one.
	hex_digits = "0123456789ABCDEF";

	# Number of assigned glyphs per Unicode block, keyed by the block's
	# lowest code point (decimal here, hexadecimal in the comment).
	# uc_range() consults this table before falling back to the size of
	# the range.
	ASSIGNED[0]    = 95;	# U+0000: ascii
	ASSIGNED[128]  = 96;	# U+0080: latin1
	ASSIGNED[8192] = 83;	# U+2000: general punct.
}
# Debug check: when `expression` is false, write
# "DEBUG ASSERTION FAILED <message>" to standard error and execute `exit 1`.
# When `expression` is true the function returns without doing anything.
function assert(expression, message) {
	if(expression)
		return;
	print "DEBUG ASSERTION FAILED "message > "/dev/stderr";
	exit 1;
}
# Convert a string of hexadecimal digits (either case) to its numeric value.
#   x       - the digit string; an empty string yields 0
#   i, y, d - locals
# Every character is looked up in the global `hex_digits`; a character that is
# not found there trips assert().
function hex(x, i, y, d) {
	y = 0;
	x = toupper(x);		# fold case once instead of per character
	i = 0;
	while(++i <= length(x)) {
		d = index(hex_digits, substr(x, i, 1));
		assert(d > 0 && d <= 16, "Hexadecimal character required");
		y = y * 16 + d - 1;	# index() is 1-based, digit values are 0-based
	}
	return y;
}
# Format the coverage cell of one font for one range or character set.
#   n        - number of glyphs of the range present in the font
#   assigned - number of glyphs the range defines
# Returns:
#   " - "    when the font has none of the glyphs,
#   " ███"   when n equals assigned,
#   the right-aligned count when it fits in three digits,
#   the covered share as an integer percentage otherwise.
function display_coverage_font(n, assigned) {
	if(n == 0)
		return " - ";
	if(n == assigned)
		return " ███";
	if(n <= 999)
		return sprintf(" %3d", n);
	# Four-digit counts do not fit the cell, so show the share instead.
	# The previous format "%3%%" contained no conversion for the value and
	# therefore never printed the number; "%3d%%" prints the integer
	# percentage followed by a literal percent sign.
	return sprintf("%3d%%", int(n*100/assigned));
}
# Print one report line for the code point range first..last (inclusive).
#   name     - label printed at the end of the line
#   first    - lowest code point of the range (number)
#   last     - highest code point of the range (number)
#   assigned - optional number of assigned glyphs in the range; when omitted
#              it is taken from ASSIGNED[first] and, failing that, from the
#              size of the range
#   j, k, n, t, ks, output - locals
# The line holds one display_coverage_font() cell per loaded font and is
# printed only when at least one font has at least one glyph in the range.
# Glyph presence is read from the global `exist`, whose keys are the code
# point formatted with "%8d" followed by SUBSEP and the font index.
function uc_range(name, first, last, assigned, j, k, n, t, ks, output) {
	# Test membership first so that the lookup does not create an empty
	# ASSIGNED entry for every block that has no override.
	if(!assigned && (first in ASSIGNED)) assigned = ASSIGNED[first];
	if(!assigned) assigned = last - first + 1;
	t = 0;
	output = "";
	for(j = 1; j <= fontno; j += 1) {
		n = 0;
		for(k = first; k <= last; k += 1) {
			ks = sprintf("%8d", k + 0);
			if((ks, j) in exist) {
				n = n + 1;
			}
		}
		t = t + n;
		output = output display_coverage_font(n, assigned);
	}
	if(t > 0) {
		printf("%04X..%04X: %s [%d] %s\n", first, last, output, assigned, name);
	}
}
# Print one uc_range() line for every block listed in charsets/Blocks.txt.
#   s, A, filename - locals
# Only lines of the form "XXXX..YYYY; Block Name" are used; all other lines
# are skipped. When the file cannot be read the loop body never runs and
# nothing is printed.
function unicode_blocks(s, A, filename) {
	filename = "charsets/Blocks.txt";
	while((getline s < filename) > 0) {
		if(match(s, "^([0-9A-F]+)\\.\\.([0-9A-F]+); (.*)$", A)) {
			uc_range(A[3], hex(A[1]), hex(A[2]));
		}
	}
	# Release the file, as character_set_common() does for the files it reads.
	close(filename);
}
# Print one report line for a character set read from a mapping file.
#   charset  - label of the character set (printed at both ends of the line)
#   filename - file to read the code points from
#   pattern  - regular expression whose first group captures the hexadecimal
#              code point of a mapping line
#   remaining parameters - locals
# Code points below 32 and in 127..159 are ignored as control characters and
# every code point is counted once. The line holds one
# display_coverage_font() cell per font, the number of characters in the set
# and, when at least one font is incomplete, one example of a missing
# character taken from the font that lacks the fewest, with that font's letter.
# Side effect: PROCINFO["sorted_in"] is set to "" for the rest of the run.
function character_set_common(charset, filename, pattern, line, j, k, ks, n, code, assigned, missing, least_missing_n, least_missing_j, least_missing, A, T) {
	printf("%-10.10s: ", charset);
	PROCINFO["sorted_in"]="";
	assigned = 0;
	while((getline line < filename) > 0) {
		if(!match(line, pattern, A)) continue;
		code = hex(A[1]);
		# skip control characters
		if(code < 32 || code >= 127 && code < 160) continue;
		if(!(code in T)) {
			T[code] = 1;
			assigned = assigned + 1;
		}
	}
	close(filename);
	least_missing_j = 0;	# 0 means "no incomplete font seen yet"
	for(j = 1; j <= fontno; j += 1) {
		n = 0;
		for(k in T) {
			ks = sprintf("%8d", k + 0);
			if((ks, j) in exist) {
				n = n + 1;
			} else {
				# Force a number: "%c" prints the first character
				# of a string but the character itself for a number.
				missing = k + 0;
			}
		}
		printf("%s", display_coverage_font(n, assigned));
		if(assigned - n > 0 && (!least_missing_j || assigned - n < least_missing_n)) {
			least_missing_n = assigned - n;
			least_missing = missing;
			least_missing_j = j;
		}
	}
	if(least_missing_j) {
		printf(" [%d] %s (%04X: %c @ %c)\n", assigned, charset, least_missing, least_missing, least_missing_j + 96);
	} else {
		# Nothing is missing anywhere, so there is no example to show
		# (the old code printed a meaningless "(0000: @ `)" here).
		printf(" [%d] %s\n", assigned, charset);
	}
}
# Report every character set file below a directory.
#   dir     - directory searched recursively with find(1)
#   postfix - literal file name suffix selecting the files, e.g. ".ucm"
#   pattern - mapping-line pattern handed on to character_set_common()
#   list, s, charset, stem - locals
# The character set label is the file name without directory and suffix.
function character_sets_in_dir(dir, postfix, pattern, list, s, charset, stem) {
	list = "find '"dir"' -type f -name '*" postfix "'";
	while((list | getline s) > 0) {
		charset = s;
		sub("^.*/", "", charset);
		# Remove the suffix by literal comparison; used as a regular
		# expression its "." would match any character.
		stem = length(charset) - length(postfix);
		if(stem >= 0 && substr(charset, stem + 1) == postfix) {
			charset = substr(charset, 1, stem);
		}
		character_set_common(charset, s, pattern);
	}
	# Reap the find process and release its pipe.
	close(list);
}
# Report every UCM character set found below "charsets".
# A mapping line is "<UXXXX> \xNN[\xNN...] |0": one code point, one or more
# bytes and the round-trip flag |0. The previous pattern accepted only a single
# "\xNN" byte, so all mappings of multi-byte character sets were dropped
# silently. Bytes may optionally be joined with "+".
function all_character_sets() {
	character_sets_in_dir("charsets", ".ucm", "^<U([0-9A-F]+)>\\s+(\\\\x[0-9A-F]+\\+?)+\\s+\\|0");
}
# Report a single console character set file from /usr/share/bdf2psf.
#   charset - file name inside that directory, also used as the label
# Uses the same line pattern as all_console_sets(), which accepts "U+XXXX"
# with hexadecimal digits of either case, followed by end of line or
# whitespace, so one file gives the same counts through either entry point.
function console_set(charset) {
	character_set_common(charset, "/usr/share/bdf2psf/"charset, "^U\\+([0-9A-Fa-f]+)($|\\s)");
}
# Report all console character sets found below /usr/share/bdf2psf.
#   dir, pattern, suffix, idx - locals
# Files are selected by suffix, in the order ".set", ".256", ".512"; a line
# counts when it starts with "U+" and hexadecimal digits followed by end of
# line or whitespace.
function all_console_sets(dir, pattern, suffix, idx) {
	pattern = "^U\\+([0-9A-Fa-f]+)($|\\s)";
	dir = "/usr/share/bdf2psf";
	suffix[1] = ".set";
	suffix[2] = ".256";
	suffix[3] = ".512";
	for(idx = 1; idx <= 3; idx++) {
		character_sets_in_dir(dir, suffix[idx], pattern);
	}
}
#function ada_set(charset) {
# character_set_common(charset, "../charsets/"charset".hex", "16\\#([0-9A-F]+)\#");
#}
# ---- BDF input rules --------------------------------------------------------
# Each input file is a BDF font. These rules fill the globals:
#   fontno             - number of fonts seen so far (index of the current one)
#   fontname[fontno]   - second field of the FONT line
#   fontid[fontno]     - name of the file the font was read from
#   pixelsize[fontno]  - second field of the PIXEL_SIZE line
#   charname[code]     - glyph name from STARTCHAR, keyed by the code point
#                        formatted with "%8d" so keys sort numerically as text
#   exist[code,fontno] - set when font `fontno` defines the glyph `code`
# (The empty BEGIN block that used to precede these rules did nothing and was
# removed.)

# A FONT line starts a new font.
$1 == "FONT" {
	fontno += 1
	fontname[fontno] = $2
	fontid[fontno] = FILENAME
	name = ""
	code = ""
}
$1 == "PIXEL_SIZE" {
	pixelsize[fontno] = $2
}
# Remember the glyph name until its ENCODING line arrives. The keyword and the
# separator after it are stripped (substr($0, 10) kept the separating blank).
$1 == "STARTCHAR" {
	name = $0
	sub(/^[ \t]*STARTCHAR[ \t]+/, "", name)
}
# Negative encodings (ENCODING -1) mark glyphs that have no code point in the
# font's encoding; recording them put a stray cell into the coverage map and
# shifted the cells printed after it.
$1 == "ENCODING" && $2 + 0 >= 0 {
	code = sprintf("%8d", $2 + 0)
	charname[code] = name
	exist[code,fontno] = 1
}
# Defaults.
BEGIN {
	# Default order for "for (key in array)" loops: ascending numeric index.
	# The previous value "@ind_val_asc" is not one of gawk's predefined
	# orders; gawk would treat it as the name of a user-defined comparison
	# function and abort at the first for-in loop that ran with it.
	PROCINFO["sorted_in"] = "@ind_num_asc"
	# Not referenced anywhere else in this script.
	threshold = 0.707
}
# Absolute value: returns -x when x is below zero and x itself otherwise.
function abs(x) {
	if(x < 0)
		return -x
	return x
}
# Final report, printed after all fonts have been read:
#   1. a legend assigning a letter (a, b, c, ...) to every input file,
#   2. a coverage map with `cpr` code points per row,
#   3. a table with one line per Unicode block and per character set.
END {
	# 1. Legend.
	for(f = 1; f <= fontno; f++) {
		printf("%c. %s\n", f+96, fontid[f]);
	}

	# 2. Coverage map. The keys of charname are code points formatted with
	# "%8d", so sorting them as strings visits them in numeric order.
	total = asorti(charname, g)
	cpr = 64
	row = -1
	col = -1
	for(pos = 1; pos <= total; pos++) {
		i = g[pos]
		nr = int(i/cpr)		# row of this code point
		nc = i%cpr		# column inside the row

		# Count the fonts having the glyph; remember one font that has
		# it (nk) and one that lacks it (ak).
		ns = 0
		for(k = 1; k <= fontno; k++) {
			if((i, k) in exist) {
				ns++
				nk = k
			} else {
				ak = k
			}
		}

		# Cell: the letter of the only font having the glyph, the
		# letter of the only font lacking it in reverse video, or a
		# bar whose width grows with the share of fonts having it.
		if(ns == 1) {
			och = sprintf("%c", nk + 96)
		} else if(ns + 1 == fontno) {
			och = sprintf("\x1B[7m%c\x1B[0m", ak + 96)
		} else {
			och = substr("▏▎▍▌▋▊▉█", int(ns*7/fontno+1), 1)
		}

		# Start a new row, labelled with its first code point.
		if(nr != row) {
			printf("\n%06X ", nr*cpr)
			row = nr
			col = -1
		}
		# Pad over code points that no font defines.
		if(nc - col > 1) {
			printf("%*s", nc - col-1, " ")
		}
		printf("%s",och)
		col = nc
	}
	printf("\n")

	# 3. Table header with one column letter per font, then the rows.
	printf(" Range ");
	for(f = 1; f <= fontno; f++) {
		printf(" %c", f + 96);
	}
	printf("\n");
	unicode_blocks();
	uc_range("Replacement character", 65533, 65533);
	all_character_sets();
	all_console_sets();
	#ada_set("kyouiku_list");
	#ada_set("jouyou_kanji");
}