+ character coverage analysis tool
This commit is contained in:
parent
865a2b0662
commit
a9df1cccaa
|
@ -1 +1,3 @@
|
|||
*.bak
|
||||
*.bak
|
||||
charsets/*.ucm
|
||||
charsets/Blocks.txt
|
10
Makefile
10
Makefile
|
@ -6,3 +6,13 @@ check-metrics-sans : check-metrics-sans-x check-metrics-sans-xb check-metrics-sa
|
|||
|
||||
check-metrics-sans-x check-metrics-sans-xb check-metrics-sans-xi : check-metrics-sans-x% :
|
||||
awk -f tools/check-metrics.awk sans/salut-sans[0-9][0-9]$*.bdf
|
||||
|
||||
# Glyph coverage reports.
# tools/check-coverage.awk uses gawk-only features (three-argument match(),
# asorti(), PROCINFO), so it must not be run with whatever "awk" happens to be.
# Override on the command line if gawk lives elsewhere: make GAWK=/path/to/gawk
GAWK ?= gawk

.PHONY : check-coverage check-coverage-sans check-coverage-mono

check-coverage : check-coverage-sans check-coverage-mono

check-coverage-sans :
	$(GAWK) -f tools/check-coverage.awk sans/salut-sans*.bdf

check-coverage-mono :
	$(GAWK) -f tools/check-coverage.awk mono/salut-mono*.bdf
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
Place charset files here in UCM format.
|
||||
They keep their own copyright, so I do not include them in the repository.
|
||||
You can get them here:
|
||||
https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm
|
||||
|
||||
It is also useful to place the Unicode block list provided by unicode.org here.
|
||||
The link is https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt
|
||||
|
||||
(Note that they may change the location; in that case, please kindly report the broken link.)
|
|
@ -0,0 +1,236 @@
|
|||
#!/usr/bin/gawk -f
|
||||
|
||||
# Global tables used by the helper functions below.
BEGIN {
    # Digit alphabet for hex(): the 1-based position of a character in this
    # string, minus one, is the digit's value.
    hex_digits = "0123456789ABCDEF"

    # Assigned glyph counts per block, keyed by the block's lowest code point
    # (given in decimal here). Blocks without an entry are handled by uc_range().
    ASSIGNED[0]    = 95     # U+0000: ascii
    ASSIGNED[128]  = 96     # U+0080: latin1
    ASSIGNED[8192] = 83     # U+2000: general punct.
}
|
||||
|
||||
# Do nothing when `expression` is true. Otherwise write
# "DEBUG ASSERTION FAILED <message>" to stderr and exit with status 1.
function assert(expression, message) {
    if (expression)
        return;
    print "DEBUG ASSERTION FAILED "message > "/dev/stderr";
    exit 1;
}
|
||||
|
||||
# Convert the string of hexadecimal digits `x` (either letter case) to a number.
# An empty string yields 0. Any character outside the global `hex_digits`
# alphabet trips assert(). The names after `x` are locals.
function hex(x,    pos, value, digit) {
    value = 0;
    pos = 0;
    while (++pos <= length(x)) {
        # index() is 1-based: a hit at position p stands for the digit p - 1,
        # and 0 means "not a hex digit".
        digit = index(hex_digits, toupper(substr(x, pos, 1)));
        assert(digit > 0 && digit <= 16, "Hexadecimal character required");
        value = 16 * value + (digit - 1);
    }
    return value;
}
|
||||
|
||||
# Format one table cell describing how many of `assigned` characters a font has.
#   n == 0         -> a dash (the font has nothing from this set)
#   n == assigned  -> a solid bar (complete coverage)
#   n <= 999       -> the raw count, right-aligned in three columns
#   otherwise      -> the integer percentage n*100/assigned followed by '%'
# Returns the cell as a string.
function display_coverage_font(n, assigned) {
    if (n == 0)
        return sprintf(" - ");
    if (n == assigned)
        return sprintf(" ███");
    if (n <= 999)
        return sprintf(" %3d", n);
    # The conversion letter was missing here ("%3%%"), so large counts never
    # printed a number; %d truncates the quotient to an integer.
    return sprintf("%3d%%", n * 100 / assigned);
}
|
||||
|
||||
# Print one coverage row for the code points first..last (inclusive, numeric).
#   name     - label printed at the end of the row
#   assigned - number of assigned characters in the range; when omitted (or 0)
#              it is taken from ASSIGNED[first], and failing that defaults to
#              the size of the range
# For every loaded font (1..fontno) the number of code points present in the
# global `exist` table is counted and rendered with display_coverage_font().
# The row is printed only when at least one font has a glyph in the range.
# Names after `assigned` are locals; `ks` and `output` used to leak into the
# global namespace, and the never-read `missing` bookkeeping has been removed.
function uc_range(name, first, last, assigned,    j, k, n, t, ks, output) {
    if (!assigned) assigned = ASSIGNED[first];
    if (!assigned) assigned = last - first + 1;

    t = 0;
    output = "";
    for (j = 1; j <= fontno; j += 1) {
        n = 0;
        for (k = first; k <= last; k += 1) {
            # `exist` is keyed by the code formatted with "%8d" plus the font number
            ks = sprintf("%8d", k + 0);
            if ((ks, j) in exist)
                n = n + 1;
        }
        t = t + n;
        output = output display_coverage_font(n, assigned);
    }

    if (t > 0)
        printf("%04X..%04X: %s [%d] %s\n", first, last, output, assigned, name);
}
|
||||
|
||||
# Read charsets/Blocks.txt and print a coverage row (via uc_range) for every
# line of the form "XXXX..YYYY; Block Name". Other lines are ignored.
# If the file cannot be read, getline does not return a positive value and
# nothing is printed. `s`, `A` and `blocks_file` are locals.
function unicode_blocks(s, A,    blocks_file) {
    blocks_file = "charsets/Blocks.txt";
    while ((getline s < blocks_file) > 0) {
        if (match(s, "^([0-9A-F]+)\\.\\.([0-9A-F]+); (.*)$", A))
            uc_range(A[3], hex(A[1]), hex(A[2]));
    }
    # Release the file; without this a second call would resume at end-of-file.
    close(blocks_file);
}
|
||||
|
||||
# Print one coverage row for a character set described by a text file.
#   charset  - label for the row (the first column shows at most 10 characters)
#   filename - file to read, line by line
#   pattern  - regex whose first capture group is a code point in hexadecimal;
#              lines that do not match are ignored
# Code points below 32 and in 127..159 are skipped as control characters.
# Each remaining distinct code point counts once towards the set's size.
# For each loaded font a cell from display_coverage_font() is printed, then
# the set size and name. If at least one font is incomplete, the row ends with
# a hint "(XXXX: c @ f)": one missing code point of the font with the fewest
# gaps, that character, and the font's letter. Names after `pattern` are locals.
function character_set_common(charset, filename, pattern,    line, j, k, ks, n, code, assigned, missing, least_missing_n, least_missing_j, least_missing, A, T, gap) {
    printf("%-10.10s: ", charset);

    # Reset any globally configured traversal order for the `for (k in T)`
    # loop below.
    PROCINFO["sorted_in"]="";

    # Pass 1: collect the distinct, non-control code points of the set.
    assigned = 0;
    while ((getline line < filename) > 0) {
        if (!match(line, pattern, A))
            continue;
        code = hex(A[1]);
        if (code < 32 || (code >= 127 && code < 160))
            continue;   # skip control characters
        if (!(code in T)) {
            T[code] = 1;
            assigned = assigned + 1;
        }
    }
    close(filename);

    # Pass 2: per-font coverage, remembering the font that is closest to
    # complete. least_missing_j == 0 means "no incomplete font seen yet";
    # this replaces the former 99999 sentinel, which ignored larger gaps.
    least_missing_j = 0;
    for (j = 1; j <= fontno; j += 1) {
        n = 0;
        for (k in T) {
            ks = sprintf("%8d", k + 0);
            if ((ks, j) in exist) {
                n = n + 1;
            } else {
                # Force a number so that %c below renders the code point
                # itself rather than the first character of an index string.
                missing = k + 0;
            }
        }
        printf("%s", display_coverage_font(n, assigned));

        gap = assigned - n;
        if (gap > 0 && (least_missing_j == 0 || gap < least_missing_n)) {
            least_missing_n = gap;
            least_missing = missing;
            least_missing_j = j;
        }
    }

    if (least_missing_j > 0) {
        printf(" [%d] %s (%04X: %c @ %c)\n", assigned, charset, least_missing, least_missing, least_missing_j + 96);
    } else {
        # Every font covers the set (or there are no fonts): there is nothing
        # to hint at, so do not print a meaningless "(0000: ...)" suffix.
        printf(" [%d] %s\n", assigned, charset);
    }
}
|
||||
|
||||
# Report coverage for every regular file below `dir` whose name ends in
# `postfix`. The files are enumerated with find(1); the row label is the file
# name without directory and without the postfix. `pattern` is passed through
# to character_set_common(). Names after `pattern` are locals.
# NOTE(review): `dir` and `postfix` are pasted into a shell command inside
# single quotes; callers in this file pass fixed strings only.
function character_sets_in_dir(dir, postfix, pattern,    list, s, charset) {
    list = "find '"dir"' -type f -name '*" postfix "'";
    while ((list | getline s) > 0) {
        charset = s;
        sub("^.*/", "", charset);
        # Cut the suffix by length: find only returns names ending in it, and
        # using it as a regex would treat its "." as a wildcard.
        charset = substr(charset, 1, length(charset) - length(postfix));
        character_set_common(charset, s, pattern);
    }
    # Reap the find process and free the pipe.
    close(list);
}
|
||||
|
||||
# Report coverage for every *.ucm file found under the "charsets" directory.
# The pattern accepts lines shaped like "<U0041> \x41 |0": a code point in
# angle brackets, whitespace, one \x-escaped byte, whitespace, then "|0".
# NOTE(review): a mapping with several \x bytes does not match this pattern;
# confirm that skipping those is intended.
function all_character_sets(    ucm_entry) {
    ucm_entry = "^<U([0-9A-F]+)>\\s+\\\\x[0-9A-F]+\\s+\\|0";
    character_sets_in_dir("charsets", ".ucm", ucm_entry);
}
|
||||
|
||||
# Report coverage for the single file /usr/share/bdf2psf/<charset>, counting
# lines that consist solely of "U+" followed by upper-case hex digits.
function console_set(charset,    path) {
    path = "/usr/share/bdf2psf/"charset;
    character_set_common(charset, path, "^U\\+([0-9A-F]+)$");
}
|
||||
|
||||
# Report coverage for the files under /usr/share/bdf2psf, one suffix group
# after another: *.set, then *.256, then *.512. A line counts when it starts
# with "U+" and hex digits (either case) followed by whitespace or end of line.
# All parameters are locals; callers pass no arguments.
function all_console_sets(dir, pattern,    suffix, i) {
    dir = "/usr/share/bdf2psf";
    pattern = "^U\\+([0-9A-Fa-f]+)($|\\s)";
    suffix[1] = ".set";
    suffix[2] = ".256";
    suffix[3] = ".512";
    for (i = 1; i <= 3; i++)
        character_sets_in_dir(dir, suffix[i], pattern);
}
|
||||
|
||||
#function ada_set(charset) {
|
||||
# character_set_common(charset, "../charsets/"charset".hex", "16\\#([0-9A-F]+)\#");
|
||||
#}
|
||||
|
||||
# Placeholder: this BEGIN rule has no statements.
BEGIN { }
|
||||
|
||||
# A FONT line starts the next font: bump the font counter, record the font's
# name (second field) and the file it comes from, and clear the pending
# glyph name and code.
$1 == "FONT" {
    fontname[++fontno] = $2
    fontid[fontno] = FILENAME
    code = name = ""
}
|
||||
|
||||
# Remember the PIXEL_SIZE value of the font currently being read.
$1 == "PIXEL_SIZE" { pixelsize[fontno] = $2 }
|
||||
|
||||
# Keep the rest of a STARTCHAR line (from column 10 onwards) as the name of
# the glyph whose ENCODING line is expected next.
$1 == "STARTCHAR" { name = substr($0, 10) }
|
||||
|
||||
# Register the glyph announced by the preceding STARTCHAR line under its code.
# Codes are stored as "%8d"-formatted strings so that asorti()'s default
# string ordering in the END rule puts them in numeric order.
# BDF marks glyphs that have no code point with "ENCODING -1"; those are
# skipped, because a negative code is not a character and would otherwise
# show up as a stray cell in the coverage map.
$1 == "ENCODING" && $2 + 0 >= 0 {
    code = sprintf("%8d", $2 + 0)
    charname[code] = name
    exist[code, fontno] = 1
}
|
||||
|
||||
BEGIN {
    # Default order for `for (index in array)` loops: ascending numeric index.
    # The previous value "@ind_val_asc" is not one of gawk's predefined
    # orders; gawk then treats the string as the name of a user comparison
    # function, which does not exist.
    PROCINFO["sorted_in"] = "@ind_num_asc"
    # NOTE(review): `threshold` is not read anywhere in the visible script.
    threshold = 0.707
}
|
||||
|
||||
# Absolute value of x.
function abs(x) {
    if (x < 0)
        return -x
    return x
}
|
||||
|
||||
# Report, printed once all fonts have been read:
#   1. a legend assigning a letter (a, b, c, ...) to each input file;
#   2. a character map with 64 cells per row, one cell per code point that
#      occurs in at least one font;
#   3. a coverage table: Unicode blocks, the replacement character, the
#      character sets under charsets/, and the console sets.
END {
    # 1. Legend
    for (j = 1; j <= fontno; j++)
        printf("%c. %s\n", j+96, fontid[j]);

    # 2. Character map. asorti() sorts the "%8d"-formatted codes as strings.
    cells_per_row = 64
    row = -1
    col = -1
    total = asorti(charname, sorted_codes)
    for (j = 1; j <= total; j += 1) {
        key = sorted_codes[j]
        cur_row = int(key / cells_per_row)
        cur_col = key % cells_per_row

        # Count the fonts that have this code point; remember the last font
        # that has it and the last font that lacks it.
        have = 0
        for (k = 1; k <= fontno; k += 1) {
            if ((key, k) in exist) {
                have += 1
                only_font = k
            } else {
                absent_font = k
            }
        }

        if (have == 1) {
            # exactly one font has it: show that font's letter
            cell = sprintf("%c", only_font + 96)
        } else if (fontno == have + 1) {
            # exactly one font lacks it: show that font's letter in reverse video
            cell = sprintf("\x1B[7m%c\x1B[0m", absent_font + 96)
        } else {
            # otherwise: a block glyph that grows with the share of fonts
            cell = substr("▏▎▍▌▋▊▉█", int(have*7/fontno+1), 1)
        }

        # Start a new line, labelled with its first code point, when the row changes.
        if (cur_row != row) {
            row = cur_row
            printf("\n%06X ", row*cells_per_row)
            col = -1
        }
        # Pad over cells for which no font has a glyph.
        if (cur_col - col > 1) {
            printf("%*s", cur_col - col - 1, " ")
        }
        printf("%s", cell)
        col = cur_col
    }
    printf("\n")

    # 3. Coverage table: header with one letter per font, then the rows.
    printf(" Range ");
    for (j = 1; j <= fontno; j += 1)
        printf(" %c", j + 96);
    printf("\n");

    unicode_blocks();
    uc_range("Replacement character", 65533, 65533);
    all_character_sets();
    all_console_sets();
    #ada_set("kyouiku_list");
    #ada_set("jouyou_kanji");
}
|
Loading…
Reference in New Issue