78 lines
2.0 KiB
JavaScript
78 lines
2.0 KiB
JavaScript
/**
 * Imports the corpus data (plain-text files and JavaScript sources) and
 * builds the n-gram sample texts that this module exports.
 */
|
||
|
||
const fs = require("fs");
|
||
const { execSync } = require("child_process");
|
||
|
||
// Shell command that concatenates every Markdown file below the current
// directory. The pattern is quoted so the shell hands it to `find` verbatim
// instead of expanding it against the working directory, and `-exec ... +`
// passes the file names straight to `cat`, so paths containing whitespace
// are not split apart.
const DOCS_CMD = "find . -type f -name '*.md' -exec cat {} +";
// Disabled: the docs corpus is currently not loaded.
// const docs = exports.docs = execSync(DOCS_CMD, {timeout: 0, maxBuffer: 1024 * 1024 * 1024}).toString().replace(/\n```[\s\S]+?\n```\n/g, "");
|
||
|
||
// Natural-language corpus: the contents of every entry in the `text`
// directory whose name ends in `.txt`, separated by a blank line.
const textFilenames = fs.readdirSync("text").filter(name => /\.txt$/.test(name));
// `readFileSync` is called without an encoding, so it yields Buffers; `join`
// stringifies each of them when building the final string.
const textContents = textFilenames.map(name => fs.readFileSync(`text/${name}`));
const text = exports.text = textContents.join("\n\n");
|
||
|
||
// Shell command that concatenates every JavaScript file below the current
// directory. The pattern is quoted so the shell hands it to `find` verbatim
// (an unquoted `*.js` is expanded against the working directory first, which
// breaks or narrows the search as soon as a `.js` file lives there), and
// `-exec ... +` passes the file names straight to `cat`, so paths containing
// whitespace are not split apart.
const CODE_CMD = "find . -type f -name '*.js' -exec cat {} +";
// Source-code corpus. No timeout is applied and up to 1 GiB of output is
// accepted, because the combined sources can be large.
const code = exports.code = execSync(CODE_CMD, {timeout: 0, maxBuffer: 1024 * 1024 * 1024}).toString();
|
||
|
||
// FIXME: these weights have no cited source and are known to be incorrect.
// Each key is a three-letter Latin key sequence; the trailing comment shows
// the Cyrillic trigram it is annotated with. Each value is the trigram's weight.
const POPULAR_TRIGRAMS = {
  "cnd": 80000000, // ств
  "cnj": 59623899, // сто
  "tyj": 27088636, // ено
  "yjd": 19494469, // нов
  "njd": 13977786, // тов
  "jdj": 11059185, // ово
  "tdf": 10141992, // ева
  "jdf": 10141992, // ова
};
|
||
|
||
// FIXME: these weights have no cited source and are known to be incorrect.
// Each key is a two-letter Latin key sequence; the trailing comment shows
// the Cyrillic bigram it is annotated with. Each value is the bigram's weight.
const POPULAR_BIGRAMS = {
  "cn": 92535489, // ст
  "yj": 87741289, // но
  "ty": 54433847, // ен
  "nj": 51910883, // то
  "yf": 51015163, // на
  "jd": 41694599, // ов
  "yb": 37466077, // ни
  "hf": 33802063, // ра
  "dj": 32967758, // во
  "rj": 31830493, // ко
};
|
||
|
||
// Sample texts generated from the weighted trigram and bigram tables; each
// is bound to a local constant and published on `exports` under the same name.
const trigrams = generate_text_from(POPULAR_TRIGRAMS);
exports.trigrams = trigrams;

const bigrams = generate_text_from(POPULAR_BIGRAMS);
exports.bigrams = bigrams;
|
||
|
||
/**
 * Builds a space-separated sample text in which every key of `dictionary`
 * appears in proportion to its weight.
 *
 * Each key's share of the total weight is rounded to a whole percent. The key
 * is emitted that many times as-is, followed by the same number of copies
 * with its first letter upper-cased.
 *
 * @param {Object<string, number>} dictionary - Maps a token to its weight.
 *   Only the object's own enumerable keys are used.
 * @returns {string} The generated text; empty when no key reaches a rounded
 *   share of at least one percent.
 */
function generate_text_from(dictionary) {
  const keys = Object.keys(dictionary);

  let total = 0;
  for (const key of keys) {
    total += dictionary[key];
  }

  const tokens = [];
  for (const key of keys) {
    const percent = Math.round(dictionary[key] / total * 100);

    // A share that rounds to zero (or is NaN because the total is zero)
    // contributes nothing. Pushing its empty strings would leave runs of
    // consecutive spaces in the joined text.
    if (!(percent > 0)) {
      continue;
    }

    tokens.push(repeat(key, percent));
    tokens.push(repeat(titleize(key), percent));
  }

  return tokens.join(" ");
}
|
||
|
||
|
||
/**
 * Joins `times` copies of `string` with single spaces.
 *
 * @param {string} string - The token to repeat.
 * @param {number} times - How many copies to produce; values that are not
 *   greater than zero yield an empty string.
 * @returns {string} The copies separated by one space each.
 */
function repeat(string, times) {
  const copies = [];
  for (let remaining = times; remaining > 0; remaining--) {
    copies.push(string);
  }
  return copies.join(" ");
}
|
||
|
||
/**
 * Returns `str` with its first character upper-cased and the rest unchanged.
 *
 * The first character is taken as a whole code point, so a letter encoded as
 * a surrogate pair is upper-cased correctly rather than being split in half.
 *
 * @param {string} str - The text to capitalize.
 * @returns {string} The capitalized text; an empty string for empty input.
 */
function titleize(str) {
  if (str.length === 0) {
    return "";
  }

  const first = String.fromCodePoint(str.codePointAt(0));
  return first.toUpperCase() + str.slice(first.length);
}
|