2016-02-25 01:12:42 +01:00
|
|
|
|
/**
|
|
|
|
|
* Imports data and builds the library
|
|
|
|
|
*/
|
|
|
|
|
|
2016-03-25 02:39:35 +01:00
|
|
|
|
const fs = require("fs");
|
2016-03-28 02:36:51 +02:00
|
|
|
|
const { execSync } = require("child_process");
|
2016-02-25 01:12:42 +01:00
|
|
|
|
|
2016-03-28 02:36:51 +02:00
|
|
|
|
// Shell command that prints the contents of every Markdown file below the
// current directory. The pattern is quoted so the shell hands it to `find`
// untouched (an unquoted *.md is expanded against the current directory first),
// and `-exec cat {} +` passes the file names straight to `cat`, so names that
// contain whitespace survive.
const DOCS_CMD = "find . -type f -name '*.md' -exec cat {} +";
|
2016-03-29 12:58:31 +02:00
|
|
|
|
// const docs = exports.docs = execSync(DOCS_CMD, {timeout: 0, maxBuffer: 1024 * 1024 * 1024}).toString().replace(/\n```[\s\S]+?\n```\n/g, "");
|
2016-03-16 07:18:39 +01:00
|
|
|
|
|
2016-10-24 11:08:27 +02:00
|
|
|
|
// Collect the contents of every entry in the "text" directory whose name ends
// in ".txt" and glue them together with one blank line between files.
// readFileSync is called without an encoding, so each chunk is a Buffer that
// join() turns into a string.
const textChunks = [];
for (const filename of fs.readdirSync("text")) {
  if (!filename.endsWith(".txt")) {
    continue;
  }
  textChunks.push(fs.readFileSync(`text/${filename}`));
}
const text = textChunks.join("\n\n");
exports.text = text;
|
2016-03-01 11:18:50 +01:00
|
|
|
|
|
2016-03-28 02:36:51 +02:00
|
|
|
|
// Shell command that prints the contents of every JavaScript file below the
// current directory. The pattern is quoted so the shell hands it to `find`
// untouched (an unquoted *.js is expanded against the current directory first),
// and `-exec cat {} +` passes the file names straight to `cat`, so names that
// contain whitespace survive.
const CODE_CMD = "find . -type f -name '*.js' -exec cat {} +";
|
2016-06-26 09:32:00 +02:00
|
|
|
|
// Run CODE_CMD synchronously and keep its whole stdout as one string.
// maxBuffer is raised to 1 GiB (1024 * 1024 * 1024 bytes) so a large amount
// of output can be captured.
const CODE_EXEC_OPTIONS = { timeout: 0, maxBuffer: 1024 * 1024 * 1024 };
const code = execSync(CODE_CMD, CODE_EXEC_OPTIONS).toString();
exports.code = code;
|
2016-02-25 01:12:42 +01:00
|
|
|
|
|
2022-08-18 20:53:45 +02:00
|
|
|
|
// Weighted table of trigrams, used as input for generate_text_from().
// Each key is a sequence of Latin letters; the trailing comment shows the
// Cyrillic trigram it stands for (presumably the keys that type it on a
// Russian keyboard layout — TODO confirm).
// FIXME: the weights have no cited source and are known to be incorrect.
// Frozen so the shared table cannot be modified by accident.
const POPULAR_TRIGRAMS = Object.freeze({
  cnd: 80000000, // ств
  cnj: 59623899, // сто
  tyj: 27088636, // ено
  yjd: 19494469, // нов
  njd: 13977786, // тов
  jdj: 11059185, // ово
  tdf: 10141992, // ева
  jdf: 10141992, // ова
});
|
|
|
|
|
|
2022-08-18 20:53:45 +02:00
|
|
|
|
// Weighted table of bigrams, used as input for generate_text_from().
// Each key is a pair of Latin letters; the trailing comment shows the
// Cyrillic bigram it stands for (presumably the keys that type it on a
// Russian keyboard layout — TODO confirm).
// FIXME: the weights have no cited source and are known to be incorrect.
// Frozen so the shared table cannot be modified by accident.
const POPULAR_BIGRAMS = Object.freeze({
  cn: 92535489, // ст
  yj: 87741289, // но
  ty: 54433847, // ен
  nj: 51910883, // то
  yf: 51015163, // на
  jd: 41694599, // ов
  yb: 37466077, // ни
  hf: 33802063, // ра
  dj: 32967758, // во
  rj: 31830493, // ко
});
|
|
|
|
|
|
2016-03-23 11:38:18 +01:00
|
|
|
|
// Build the generated texts for both n-gram tables and export them.
// generate_text_from is a function declaration further down the file, so it
// is hoisted and may be called here.
const trigrams = generate_text_from(POPULAR_TRIGRAMS);
exports.trigrams = trigrams;

const bigrams = generate_text_from(POPULAR_BIGRAMS);
exports.bigrams = bigrams;
|
2016-02-28 11:49:56 +01:00
|
|
|
|
|
|
|
|
|
/**
 * Builds a space-separated text from a dictionary of weighted n-grams.
 *
 * Every weight is converted to its share of the total weight, expressed as a
 * percentage rounded to the nearest integer. For each key, that many copies
 * of the key are emitted, followed by the same number of copies of its
 * titleized form.
 *
 * Only the dictionary's own enumerable keys are used. Keys whose rounded
 * share is not a positive number (0, or NaN when the total weight is 0) are
 * skipped, so the result never contains empty tokens, i.e. no doubled or
 * trailing spaces.
 *
 * @param {Object} dictionary - Maps each n-gram to its numeric weight.
 * @returns {string} The generated tokens joined by single spaces; an empty
 *   string when nothing qualifies.
 */
function generate_text_from(dictionary) {
  const keys = Object.keys(dictionary);

  let total = 0;
  for (const key of keys) {
    total += dictionary[key];
  }

  const tokens = [];
  for (const key of keys) {
    const percent = Math.round(dictionary[key] / total * 100);

    // Written as a negated comparison so that NaN is skipped as well.
    if (!(percent > 0)) {
      continue;
    }

    tokens.push(repeat(key, percent));
    tokens.push(repeat(titleize(key), percent));
  }

  return tokens.join(" ");
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Returns `string` repeated `times` times, with the copies separated by
 * single spaces.
 *
 * @param {string} string - The text to repeat.
 * @param {number} times - How many copies to produce; nothing is produced
 *   when the comparison `0 < times` is false.
 * @returns {string} The copies joined by " ", or "" when there are none.
 */
function repeat(string, times) {
  const copies = [];
  while (copies.length < times) {
    copies.push(string);
  }
  return copies.join(" ");
}
|
|
|
|
|
|
|
|
|
|
/**
 * Returns `str` with its first character upper-cased and the rest unchanged.
 *
 * An empty string is returned as an empty string: charAt(0) yields "" there,
 * whereas indexing with [0] yields undefined and would make toUpperCase throw.
 *
 * @param {string} str - The text to capitalize.
 * @returns {string} The capitalized text.
 */
function titleize(str) {
  return str.charAt(0).toUpperCase() + str.slice(1);
}
|