csv_parse = require "csv-parse/sync"
csv_stringify = require "csv-stringify/sync"
coffee = require "coffeescript"
fs = require "fs"
hanzi_tools = require "hanzi-tools"
html_parser = require "node-html-parser"
http = require "https"
node_path = require "path"
pinyin_split = require "pinyin-split"
pinyin_utils = require "pinyin-utils"
{DOMParser, XMLSerializer} = require "xmldom"
#scraper = require "table-scraper"
read_text_file = (a) -> fs.readFileSync a, "utf8"
read_csv_file = (path, delimiter) -> csv_parse.parse read_text_file(path), {delimiter: delimiter || " ", relax_column_count: true}
replace_placeholders = (text, mapping) -> text.replace /__(.*?)__/g, (_, k) -> mapping[k] or ""
array_from_newline_file = (path) -> read_text_file(path).toString().trim().split("\n")
on_error = (a) -> if a then console.error a
delete_duplicates = (a) -> [...new Set(a)]
split_chars = (a) -> [...a]
random_integer = (min, max) -> Math.floor(Math.random() * (max - min + 1)) + min
random_element = (a) -> a[random_integer 0, a.length - 1]
n_times = (n, f) -> [...Array(n).keys()].map f
remove_non_chinese_characters = (a) -> a.replace /[^\p{Script=Han}]/ug, ""
traditional_to_simplified = (a) -> hanzi_tools.simplify a
pinyin_split2 = (a) -> a.replace(/[0-5]/g, (a) -> a + " ").trim().split " "
median = (a) -> a.slice().sort((a, b) -> a - b)[Math.floor(a.length / 2)]
sum = (a) -> a.reduce ((a, b) -> a + b), 0
mean = (a) -> sum(a) / a.length
object_array_add = (object, key, value) -> if object[key] then object[key].push value else object[key] = [value]
object_array_add_unique = (object, key, value) ->
if object[key] then object[key].push value unless object[key].includes value
else object[key] = [value]
array_intersection = (a, b) -> a.filter (a) -> b.includes(a)
write_csv_file = (path, data) ->
  csv = csv_stringify.stringify(data, {delimiter: " "}).trim()
fs.writeFile path, csv, on_error
delete_duplicates_stable = (a) ->
result = []
existing = {}
a.forEach (a) ->
unless existing[a]
existing[a] = true
result.push a
result
delete_duplicates_stable_with_key = (a, key) ->
result = []
existing = {}
a.forEach (a) ->
unless existing[a[key]]
existing[a[key]] = true
result.push a
result
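# Illustrative use: both helpers keep the first occurrence and preserve input order, e.g.
#   delete_duplicates_stable ["b", "a", "b"]                         # -> ["b", "a"]
#   delete_duplicates_stable_with_key [{k: 1}, {k: 1}, {k: 2}], "k"  # -> [{k: 1}, {k: 2}]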
lcg = (seed) ->
m = 2 ** 31
a = 1103515245
c = 12345
state = seed
->
state = (a * state + c) % m
state / m
array_shuffle = (a) ->
rand = lcg(23465700980)
n = a.length
while n > 0
i = Math.floor rand() * n
n -= 1
[a[n], a[i]] = [a[i], a[n]]
a
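# array_shuffle is a Fisher-Yates shuffle driven by the fixed-seed lcg above, so it is
# deterministic: the same input yields the same permutation on every run, and the input
# array is shuffled in place. Illustrative use:
#   array_shuffle [1, 2, 3, 4]  # same order every time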
array_deduplicate_key = (a, get_key) ->
existing = {}
a.filter (a) ->
key = get_key a
if existing[key] then false
else
existing[key] = true
true
# https://en.wiktionary.org/wiki/Appendix:Unicode
hanzi_unicode_ranges = [
["30A0", "30FF"] # katakana used for some components
["2E80", "2EFF"]
["31C0", "31EF"]
["4E00", "9FFF"]
["3400", "4DBF"]
["20000", "2A6DF"]
["2A700", "2B73F"]
["2B740", "2B81F"]
["2B820", "2CEAF"]
["2CEB0", "2EBEF"]
["30000", "3134F"]
["31350", "323AF"]
["2EBF0", "2EE5F"]
]
unicode_ranges_pattern = (a, is_reject) -> "[" + (if is_reject then "^" else "") + a.map((a) -> a.map((b) -> "\\u{#{b}}").join("-")).join("") + "]"
unicode_ranges_regexp = (a, is_reject) -> new RegExp unicode_ranges_pattern(a, is_reject), "gu"
hanzi_regexp = unicode_ranges_regexp hanzi_unicode_ranges
non_hanzi_regexp = unicode_ranges_regexp hanzi_unicode_ranges, true
hanzi_and_idc_regexp = unicode_ranges_regexp hanzi_unicode_ranges.concat([["2FF0", "2FFF"]])
non_pinyin_regexp = /[^a-z0-5]/g
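# Illustrative use of the range regexps (中 is U+4E2D and 文 is U+6587, both inside 4E00-9FFF):
#   "abc 中文 123".match(hanzi_regexp)            # -> ["中", "文"]
#   "abc 中文 123".replace(non_hanzi_regexp, "")  # -> "中文"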
get_word_frequency_index = () ->
  # -> {word: integer}
frequency = array_from_newline_file "data/words-by-frequency.txt"
frequency_index = {}
frequency.forEach (a, i) ->
a = a.replace " ", ""
    frequency_index[a] = i unless frequency_index[a]?
frequency_index
get_word_frequency_index_with_pinyin = () ->
# -> {"#{word}#{pinyin}": integer}
frequency = array_from_newline_file "data/words-by-frequency-with-pinyin.csv"
frequency_index = {}
frequency.forEach (a, i) ->
a = a.replace " ", ""
    frequency_index[a] = i unless frequency_index[a]?
frequency_index
get_all_standard_characters = () -> read_csv_file("data/table-of-general-standard-chinese-characters.csv").map (a) -> a[0]
get_all_standard_characters_with_pinyin = () ->
a = read_csv_file("data/table-of-general-standard-chinese-characters.csv").map (a) -> [a[0], a[1].split(",")[0]]
b = read_csv_file("data/additional-characters.csv").filter((a) -> !character_exclusions.includes(a[0])).map (a) -> [a[0], a[1].split(",")[0]]
a.concat b
get_all_characters = () -> read_csv_file("data/characters-strokes-decomposition.csv").map (a) -> a[0]
display_all_characters = () -> console.log get_all_characters().join("")
get_all_characters_with_pinyin = () ->
dict = dictionary_index_word_f 0
chars = {}
for a in read_csv_file "data/table-of-general-standard-chinese-characters.csv"
pinyin = a[1].split(", ")[0]
chars[a[0]] = pinyin
for a in read_csv_file "data/additional-characters.csv"
chars[a[0]] = a[1] unless chars[a[0]]
for a in read_csv_file "data/characters-strokes-decomposition.csv"
pinyin = dict(a[0])?[0][1]
chars[a[0]] = pinyin if pinyin && !chars[a[0]]
continue if a.length < 3
for b in split_chars(a[2])
continue unless b.match hanzi_regexp
pinyin = dict(b)?[0][1]
chars[b] = pinyin if pinyin && !chars[b]
data = ([a, b] for a, b of chars)
char_index = split_chars read_text_file("data/characters-by-frequency.txt").trim()
data.sort (a, b) ->
ia = char_index.indexOf a[0]
ib = char_index.indexOf b[0]
(if ia is -1 then Infinity else ia) - (if ib is -1 then Infinity else ib)
data
get_character_by_reading_index = () ->
chars = get_all_characters_with_pinyin()
result = {}
chars.forEach (a) -> object_array_add result, a[1], a[0]
result
get_frequency_characters_and_pinyin = () ->
# with duplicates. use case: count character reading frequency
result = []
a = read_csv_file "data/words-by-frequency-with-pinyin.csv"
a.forEach (a) ->
chars = split_chars a[0]
pinyin = pinyin_split2 a[1]
chars.forEach (a, i) -> result.push [a, pinyin[i]]
result
get_all_characters_sorted_by_frequency = () ->
delete_duplicates_stable get_all_characters_with_pinyin().map (a) -> split_chars(a[0])[0]
get_character_frequency_index = () ->
# -> {character: integer}
chars = get_all_characters_sorted_by_frequency()
frequency_index = {}
chars.forEach (a, i) -> frequency_index[a] = i
frequency_index
get_character_pinyin_frequency_index = () ->
# -> {character + pinyin: integer}
chars = get_frequency_characters_and_pinyin()
result = {}
index = 0
chars.forEach (a) ->
key = a[0] + (a[1] || "")
    unless result[key]?
result[key] = index
index += 1
result
update_character_reading_count = () ->
  # counts how often each character/pinyin combination occurs in the frequency
  # word list, to estimate how common the different readings of a character are
  index = {}
  rows = []
  chars = new Set get_all_characters()
  chars_and_pinyin = get_frequency_characters_and_pinyin()
  chars_and_pinyin.forEach (a) ->
    return unless chars.has a[0]
    key = a[0] + " " + (a[1] || "")
    if index[key]? then index[key] += 1
    else index[key] = 1
  Object.keys(index).forEach (a) ->
    [char, pinyin] = a.split " "
    rows.push [char, pinyin, index[a]]
  rows = rows.sort (a, b) -> b[2] - a[2]
  write_csv_file "data/characters-pinyin-count.csv", rows
sort_by_index_and_character_f = (index, character_key) ->
# {character: integer, ...}, any -> function(a, b)
f = sort_by_character_f index
(a, b) -> f a[character_key], b[character_key]
sort_by_character_f = (index) ->
(a, b) ->
ia = index[a]
ib = index[b]
if ia is undefined and ib is undefined
      (a.length - b.length) || a.localeCompare(b)
else if ia is undefined then 1
else if ib is undefined then -1
else ia - ib
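# Illustrative use: characters present in the index sort by their index value,
# unknown characters sort after them:
#   ["乙", "的"].sort sort_by_character_f {"的": 0}  # -> ["的", "乙"]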
sort_by_character_frequency = (frequency_index, character_key, data) ->
data.sort sort_by_index_and_character_f frequency_index, character_key
sort_by_stroke_count = (stroke_count_index, character_key, data) ->
data.sort sort_by_index_and_character_f stroke_count_index, character_key
sort_by_word_frequency_with_pinyin = (frequency_index, word_key, pinyin_key, data) ->
data.sort (a, b) ->
fa = frequency_index[a[word_key] + a[pinyin_key]]
fb = frequency_index[b[word_key] + b[pinyin_key]]
if fa is undefined and fb is undefined
a[word_key].length - b[word_key].length
else if fa is undefined
1
else if fb is undefined
-1
else
fa - fb
sort_by_word_frequency = (frequency_index, word_key, data) ->
data.sort (a, b) ->
fa = frequency_index[a[word_key]]
fb = frequency_index[b[word_key]]
if fa is undefined and fb is undefined
a[word_key].length - b[word_key].length
else if fa is undefined
1
else if fb is undefined
-1
else
fa - fb
dictionary_cedict_to_json = (data) ->
JSON.stringify data.map (a) ->
a[2] = a[2].split "/"
    a.push a[1].replace /[0-5]/g, "" # pinyin without tone digits
a
update_dictionary = () ->
word_data = read_csv_file "data/cedict.csv"
word_data = dictionary_cedict_to_json word_data
character_data = read_text_file "data/characters-svg.json"
script = read_text_file "src/dictionary.coffee"
script = coffee.compile(script, bare: true).trim()
script = replace_placeholders script, {word_data, character_data}
font = read_text_file "src/NotoSansSC-Light.ttf.base64"
html = read_text_file "src/hanyu-dictionary-template.html"
html = replace_placeholders html, {font, script}
fs.writeFileSync "compiled/hanyu-dictionary.html", html
clean_frequency_list = () ->
  # print only the entries that contain at least one Chinese character
  frequency_array = array_from_newline_file "data/words-by-frequency.txt"
frequency_array = frequency_array.filter (a) ->
traditional_to_simplified remove_non_chinese_characters a
frequency_array.forEach (a) -> console.log a
dictionary_index_word_f = (lookup_index) ->
dictionary = {}
read_csv_file("data/cedict.csv").forEach (a) -> object_array_add dictionary, a[lookup_index], a
(a) -> dictionary[a]
dictionary_index_word_pinyin_f = () ->
dictionary = {}
word_index = 0
pinyin_index = 1
words = read_csv_file "data/cedict.csv"
words.forEach (a) ->
word = a[word_index]
key = a[word_index] + a[pinyin_index]
object_array_add dictionary, key, a
object_array_add dictionary, word, a
(word, pinyin) -> dictionary[word + pinyin]
mark_to_number = (a) ->
a.split(" ").map((a) -> pinyin_split2(a).map(pinyin_utils.markToNumber).join("")).join(" ")
find_multiple_word_matches = (a, lookup_index, translation_index, split_syllables) ->
# for each space separated element, find all longest most frequent words with the pronunciation.
dictionary_lookup = dictionary_index_word_f lookup_index
results = []
a.split(" ").forEach (a) ->
syllables = split_syllables a
max_word_length = 5
per_length = (i, j) -> syllables.slice(i, j).join("")
per_syllable = (i) ->
end = Math.min(i + max_word_length, syllables.length) + 1
per_length i, j for j in [(i + 1)...end]
candidates = (per_syllable i for i in [0...syllables.length])
i = 0
while i < candidates.length
matches = []
j = 0
reversed_candidates = candidates[i].toReversed()
while j < reversed_candidates.length
translations = dictionary_lookup reversed_candidates[j]
if translations
matches.push translations.map((a) -> a[translation_index]).join "/"
break
j += 1
if matches.length
results.push matches[0]
i += reversed_candidates.length - j
else
results.push candidates[i][0]
i += 1
results.join " "
pinyin_to_hanzi = (a) ->
a = a.replace(non_pinyin_regexp, " ").trim()
find_multiple_word_matches a, 1, 0, pinyin_split2
hanzi_to_pinyin = (a) ->
a = a.replace(non_hanzi_regexp, " ").trim()
find_multiple_word_matches a, 0, 1, split_chars
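# Illustrative use of the two converters; the results depend on the contents of
# data/cedict.csv, so these are only sketches of the expected shape:
#   hanzi_to_pinyin "你好"     # greedy longest-word lookup, roughly "ni3 hao3"
#   pinyin_to_hanzi "ni3hao3"  # dictionary words with that reading, joined with "/"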
get_character_pinyin_index = ->
index = {}
chars = get_all_characters_with_pinyin().filter((a) -> !a[1].endsWith("5"))
chars.forEach (a) -> index[a[0]] = a[1].split(",")[0]
index
get_character_tone_index = ->
index = {}
chars = get_all_characters_with_pinyin().filter((a) -> !a[1].endsWith("5"))
chars.forEach (a) -> index[a[0]] = parseInt a[1][a[1].length - 1]
index
get_characters_by_pinyin_rows = ->
by_pinyin = {}
chars = get_all_characters_with_pinyin().filter((a) -> !a[1].endsWith("5"))
chars.forEach (a) -> object_array_add by_pinyin, a[1], a[0]
rows = Object.keys(by_pinyin).map (a) -> [a, by_pinyin[a]]
rows.sort (a, b) -> a[0].localeCompare(b[0]) || b[1].length - a[1].length
all_syllables = """
a ai an ang ao ba bai ban bang bao bei ben beng bi bian biang biao bie bin bing bo bu
ca cai can cang cao ce cei cen ceng cha chai chan chang chao che chen cheng chi chong
chou chu chua chuai chuan chuang chui chun chuo ci cong cou cu cuan cui cun cuo da dai
dan dang dao de dei den deng di dian diao die ding diu dong dou du duan dui dun duo e ei
en eng er fa fan fang fei fen feng fo fou fu ga gai gan gang gao ge gei gen geng gong gou
gu gua guai guan guang gui gun guo ha hai han hang hao he hei hen heng hong hou hu hua
huai huan huang hui hun huo ji jia jian jiang jiao jie jin jing jiong jiu ju juan jue jun
ka kai kan kang kao ke kei ken keng kong kou ku kua kuai kuan kuang kui kun kuo la lai
lan lang lao le lei leng li lia lian liang liao lie lin ling liu lo long lou lu luan lun
luo lü lüe ma mai man mang mao me mei men meng mi mian miao mie min ming miu mo mou mu
na nai nan nang nao ne nei nen neng ni nian niang niao nie nin ning niu nong nou nu nuan
nuo nü nüe o ou pa pai pan pang pao pei pen peng pi pian piao pie pin ping po pou pu qi
qia qian qiang qiao qie qin qing qiong qiu qu quan que qun ran rang rao re ren reng ri
rong rou ru rua ruan rui run ruo sa sai san sang sao se sen seng sha shai shan shang shao
she shei shen sheng shi shou shu shua shuai shuan shuang shui shun shuo si song sou su
suan sui sun suo ta tai tan tang tao te teng ti tian tiao tie ting tong tou tu tuan tui
tun tuo wa wai wan wang wei wen weng wo wu xi xia xian xiang xiao xie xin xing xiong xiu
xu xuan xue xun ya yan yang yao ye yi yin ying yong you yu yuan yue yun za zai zan zang
zao ze zei zen zeng zha zhai zhan zhang zhao zhe zhei zhen zheng zhi zhong zhou zhu zhua
zhuai zhuan zhuang zhui zhun zhuo zi zong zou zu zuan zui zun zuo
""".split " "
circle_arrows = ["→","↗","↑","↖","←","↙","↓","↘"]
get_syllable_circle_arrow = (s) ->
s = s.replace(/[0-5]$/, "")
i = all_syllables.indexOf s
circle_arrows[(Math.round(8 * i / all_syllables.length)) % 8]
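# Each syllable is mapped to one of eight arrows by its position in all_syllables,
# wrapping around the circle; the tone digit is ignored. Illustrative use:
#   get_syllable_circle_arrow "a3"  # -> "→" ("a" is the first syllable in the list)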
class_for_tone = (tone) -> "tone#{tone}"
build_prelearn = ->
prelearn = read_csv_file("/home/nonroot/chinese/1/lists/prelearn.csv").map (a) -> [a[0], a[1]]
groups = {}
for a in prelearn
object_array_add groups, a[1], a[0]
result = []
for k, v of groups
arrow = get_syllable_circle_arrow k
result.push [k + arrow, v.join("")]
result
build_pinyin_sets = ->
rows = get_characters_by_pinyin_rows()
flat = ([a[0], a[1].join("")] for a in rows)
by_count = flat.slice().sort (a, b) -> a[1].length - b[1].length
[flat, by_count]
build_contained = (tone_index, pinyin_index) ->
rows = get_characters_contained_rows()
([a[0], ([c, tone_index[c]] for c in a[1])] for a in rows)
render_row = ([label, data]) ->
if typeof data is "string"
"#{label}#{data}"
else
"#{label}#{data}"
#chars = data.map ([c, t]) -> "#{c}"
#"#{label}#{chars.join("")}"
update_character_tables_html = (tables) ->
nav_links = []
i = 0
make_table = (rows, name) ->
nav_links.push "#{name}"
i += 1
"
" + (rows.map render_row).join("\n") + "
"
content = (make_table v, k for k, v of tables).join "\n"
[content, nav_links.join("\n")]
get_characters_by_pinyin_rows_flat = ->
result = []
for a in get_characters_by_pinyin_rows()
for b in a[1]
result.push [a[0], b]
result
update_character_tables = ->
tone_index = get_character_tone_index()
pinyin_index = get_character_pinyin_index()
[pinyin, pinyin_by_count] = build_pinyin_sets()
prelearn = build_prelearn()
contained = build_contained tone_index, pinyin_index
tables =
pinyin: pinyin
contained: contained
pinyin_by_count: pinyin_by_count
prelearn: prelearn
[content, nav_links] = update_character_tables_html tables
font = read_text_file "src/NotoSansSC-Light.ttf.base64"
html = read_text_file "src/character-tables-template.html"
html = replace_placeholders html, {font, content, nav_links}
#fs.writeFileSync "compiled/character-tables.html", html
for key, value of tables
tables[key] = (b.reverse() for b in value)
prelearn2 = []
for a in prelearn
for b in split_chars a[0]
prelearn2.push [b, a[1]]
write_csv_file "tmp/prelearn.csv", prelearn2
by_pinyin = get_characters_by_pinyin_rows_flat()
by_syllable = []
for a in by_pinyin
syllable = a[0].replace /[0-5]$/, ""
by_syllable.push [syllable, a[1], a[0]]
write_csv_file "data/gridlearner/characters-by-syllable.csv", by_syllable
update_characters_by_pinyin_vertical = (rows) ->
vertical_rows = format_lines_vertically rows
fs.writeFileSync "data/characters-by-pinyin-by-count-vertical.csv", vertical_rows.join "\n"
update_characters_by_pinyin = () ->
by_pinyin = {}
chars = get_all_characters_with_pinyin().filter((a) -> !a[1].endsWith("5"))
chars.forEach (a) -> object_array_add by_pinyin, a[1], a[0]
rows = Object.keys(by_pinyin).map (a) -> [a, by_pinyin[a].join("")]
rows = rows.sort (a, b) -> a[0].localeCompare(b[0]) || b[1].length - a[1].length
write_csv_file "data/characters-by-pinyin.csv", rows
rows = rows.sort (a, b) -> b[1].length - a[1].length || a[0].localeCompare(b[0])
write_csv_file "data/characters-by-pinyin-by-count.csv", rows
#rows = rows.filter (a) -> a[1].length < 4
#rows = rows.sort (b, a) -> b[1].length - a[1].length || a[0].localeCompare(b[0])
#write_csv_file "data/characters-by-pinyin-rare.csv", rows
rare_rows = []
for p in Object.keys(by_pinyin)
if by_pinyin[p].length < 3
for c in by_pinyin[p]
rare_rows.push [c, p]
rare_rows = rare_rows.sort (a, b) -> a[1].localeCompare(b[1]) || a[0].localeCompare(b[0])
write_csv_file "data/characters-pinyin-rare.csv", rare_rows
sort_by_array_with_index = (a, sorting, index) ->
a.sort (a, b) -> sorting.indexOf(a[index]) - sorting.indexOf(b[index])
index_key_value = (a, key_key, value_key) ->
b = {}
a.forEach (a) -> b[a[key_key]] = a[value_key]
b
get_compositions_index = ->
decompositions = read_csv_file "data/characters-strokes-decomposition.csv"
  decompositions = ([a, (if c then split_chars(c) else [])] for [a, b, c] in decompositions)
compositions = {}
for a in decompositions
[char, a] = a
for component in a
c = compositions[component]
if c
unless c.includes char
c.push char
compositions[component] = c
else compositions[component] = [char]
frequency_sorter = sort_by_character_f get_character_frequency_index()
for a, b of compositions
compositions[a] = b.sort frequency_sorter
compositions
get_full_compositions_index = ->
full_decompositions = get_full_decompositions()
compositions = {}
for [char, components] in full_decompositions
for component in components
c = compositions[component]
if c
unless c.includes char
c.push char
else
compositions[component] = [char]
frequency_sorter = sort_by_character_f get_character_frequency_index()
for component, chars of compositions
compositions[component] = chars.sort frequency_sorter
compositions
get_decompositions_index = () -> index_key_value read_csv_file("data/characters-strokes-decomposition.csv"), 0, 2
get_full_decompositions = () ->
# also include decompositions of components per entry
decompositions_index = get_decompositions_index()
decompose = (a) ->
parts = decompositions_index[a]
if parts
parts = [...parts]
[a].concat(parts, parts.map(decompose))
else [a]
Object.keys(decompositions_index).map (a) ->
parts = decompose(a).flat(Infinity)
[parts[0], delete_duplicates(parts.slice(1))]
#[parts[0], parts.slice(1)]
get_full_decompositions_index = () -> index_key_value get_full_decompositions(), 0, 1
get_stroke_count_index = () ->
data = read_csv_file("data/characters-strokes-decomposition.csv")
result = {}
result[a[0]] = parseInt a[1] for a in data
result
get_character_reading_count_index = () ->
result = {}
read_csv_file("data/characters-pinyin-count.csv").forEach (a) -> result[a[0] + a[1]] = parseInt a[2]
result
get_character_syllables_tones_count_index = () ->
result = {}
read_csv_file("data/syllables-tones-character-counts.csv").forEach (a) -> result[a[0]] = parseInt a[1]
result
get_character_example_words_f = () ->
  dictionary = dictionary_index_word_pinyin_f()
words = read_csv_file "data/words-by-frequency-with-pinyin-translation.csv"
(char, pinyin, frequency_limit) ->
char_word = words.find((b) -> b[0] is char)
unless char_word
char_word = dictionary char, pinyin
char_word = char_word[0] if char_word
char_words = if char_word then [char_word] else []
char_words.concat words.filter (b, i) -> b[0].includes(char) && b[0] != char && (!frequency_limit || i < frequency_limit)
sort_standard_character_readings = () ->
reading_count_index = get_character_reading_count_index()
path = "data/table-of-general-standard-chinese-characters.csv"
rows = read_csv_file(path).map (a) ->
char = a[0]
pinyin = a[1].split(", ").map (a) -> if a.match(/[0-5]$/) then a else a + "5"
pinyin = pinyin.sort (a, b) -> (reading_count_index[char + b] || 0) - (reading_count_index[char + a] || 0)
a[1] = pinyin.join ", "
a
write_csv_file path, rows
add_sort_field = (rows) ->
a.push i for a, i in rows
rows
update_pinyin_learning = () ->
# pinyin, word_choices -> word, translation
options =
words_per_char: 3
word_choices: 5
character_frequency_index = get_character_frequency_index()
get_character_example_words = get_character_example_words_f()
standard_chars = read_csv_file("data/table-of-general-standard-chinese-characters.csv")
chars = standard_chars.map (a) -> [a[0], a[1].split(", ")[0]]
chars = sort_by_character_frequency character_frequency_index, 0, chars
rows = for a in chars
a = get_character_example_words(a[0], a[1])
if 1 < a.length then a = a.slice 1, options.words_per_char + 1
[b[1], b[0], b[2]] for b in a
rows = rows.flat 1
rows = array_deduplicate_key rows, (a) -> a[1]
add_word_choices = (rows) ->
rows.map (a) ->
tries = 30
alternatives = [a[1]]
while tries && alternatives.length < options.word_choices
alternative = random_element rows
if a[1].length == alternative[1].length && a[0] != alternative[0] && !alternatives.includes(alternative[1])
alternatives.push alternative[1]
tries -= 1
a.push array_shuffle(alternatives).join(" ")
a
rows = add_sort_field add_word_choices rows
write_csv_file "data/pinyin-learning.csv", rows
get_char_pinyin = do ->
all_chars_and_pinyin = get_all_characters_with_pinyin()
char_pinyin_index = index_key_value all_chars_and_pinyin, 0, 1
dictionary = dictionary_index_word_f 0
(a) ->
b = dictionary a
return b[0][1] if b && b.length
b = char_pinyin_index[a]
return b if b
get_char_decompositions = do ->
decompositions = get_full_decompositions_index()
strokes = get_stroke_count_index()
(a) ->
b = decompositions[a]
return [] unless b
b = b.filter((a) -> !strokes[a] || strokes[a] > 1)
b.map((a) -> [a, get_char_pinyin(a)]).filter (a) -> a[1]
characters_add_learning_data = (rows) -> # [[character, pinyin], ...] -> [array, ...]
reading_count_index = get_character_reading_count_index()
character_by_reading_index = get_character_by_reading_index()
get_character_example_words = get_character_example_words_f()
rows = array_deduplicate_key(rows, (a) -> a[0])
syllables = delete_duplicates rows.map((a) -> a[1].split(", ")).flat()
add_same_reading_characters = (rows) ->
max_same_reading_characters = 24
rows.map (a) ->
b = (character_by_reading_index[a[1]] or []).slice(0, max_same_reading_characters)
b = b.filter (b) -> a[0] != b
a.push b.join ""
a
add_syllable_arrows = (rows) ->
rows.map (a) ->
arrow = get_syllable_circle_arrow a[1]
a.push arrow
a
add_contained_characters = (rows) ->
rows.map (a) ->
b = get_char_decompositions a[0]
c = b.map((c) -> c.join(" ")).join(", ")
a.push c
a
add_example_words = (rows) ->
rows.map (a) ->
words = get_character_example_words(a[0], a[1])
a.push(words.slice(1, 5).map((b) -> b[0]).join(" "))
a.push(words.slice(0, 5).map((b) -> b.join(" ")).join("\n"))
a
rows = add_contained_characters rows
rows = add_same_reading_characters(rows)
rows = add_sort_field rows
rows = add_syllable_arrows rows
rows = add_example_words rows
rows
fix_dependency_order = (items, char_key) ->
di = get_full_decompositions_index()
pm = {}
  for a, i in items
    pm[a[char_key]] = i
i = 0
while i < items.length
c = items[i][char_key]
deps = di[c] or []
for d in deps
j = pm[d]
if j? and j > i
dep = items.splice(j, 1)[0]
items.splice(i, 0, dep)
for k in [Math.min(i, j)..Math.max(i, j)]
pm[items[k][char_key]] = k
# stay at same i to recheck moved-in deps
i -= 1
break
i += 1
items
# test examples: 刀 < 那
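# Illustrative use (assuming the decomposition data lists 刀 as a component of 那):
#   fix_dependency_order [["那", "na4"], ["刀", "dao1"]], 0
#   # -> 刀 is moved before 那, so components precede the characters that contain them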
sort_by_frequency_f = (char_key) ->
fi = get_character_frequency_index()
  # characters missing from the frequency index sort last
  (a, b) -> (fi[a[char_key]] ? Infinity) - (fi[b[char_key]] ? Infinity)
sort_by_frequency = (data, char_key) -> data.sort sort_by_frequency_f char_key
sort_by_frequency_and_dependency = (data, char_key) ->
data = data.sort sort_by_frequency_f char_key
data = fix_dependency_order data, char_key
data
update_characters_learning = ->
rows = get_all_standard_characters_with_pinyin()
rows = sort_by_frequency_and_dependency rows, 0
rows = characters_add_learning_data rows
write_csv_file "data/characters-learning.csv", rows
rows = ([i + 1, a[0], a[1], a[5], a[3]] for a, i in rows)
write_csv_file "data/characters-learning-reduced.csv", rows
update_syllables_character_count = () ->
# number of characters with the same reading
chars = read_csv_file("data/characters-by-pinyin.csv").map (a) -> [a[0], a[1].length]
chars_without_tones = chars.map (a) -> [a[0].replace(/[0-5]/g, ""), a[1]]
get_data = (chars) ->
counts = {}
chars.forEach (a) ->
if counts[a[0]] then counts[a[0]] += a[1]
else counts[a[0]] = a[1]
chars = chars.map (a) -> a[0]
chars = delete_duplicates_stable chars
chars.map((a) -> [a, counts[a]]).sort (a, b) -> b[1] - a[1]
write_csv_file "data/syllables-tones-character-counts.csv", get_data(chars)
write_csv_file "data/syllables-character-counts.csv", get_data(chars_without_tones)
grade_text_files = (paths) ->
paths.forEach (a) -> console.log grade_text(read_text_file(a)) + " " + node_path.basename(a)
grade_text = (a) ->
  chars = delete_duplicates(a.match(hanzi_regexp) || [])
  return 1 unless chars.length
  frequency_index = get_character_frequency_index()
  all_chars_count = Object.keys(frequency_index).length
  frequencies = chars.map((a) -> frequency_index[a] ? all_chars_count).sort((a, b) -> a - b)
  count_score = chars.length / all_chars_count
  rarity_score = median(frequencies.splice(-10)) / all_chars_count
  Math.max 1, Math.round(10 * (count_score + rarity_score))
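# Illustrative use: returns an integer difficulty grade (1 = only very common
# characters; larger values mean more, and rarer, distinct characters):
#   grade_text "你好"                    # small grade
#   grade_text_files ["a.txt", "b.txt"]  # prints "<grade> <basename>" per file (hypothetical paths)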
character_exclusions_gridlearner = "灬罒彳𠂉⺈辶卝埶冃丏卝宀冖亠䒑丅丷一亅⿻㇀乚丨丿⿰�丶㇒㇏⿹乛㇓㇈⿸乀㇍⿺㇋㇂㇊丆⺊ユ⿶⿵⿴⿲コ凵⿳㇌⿷囗㇎㇅㇄厸䶹乛㇓㇈㇅㇄㇈一亅㇀ 乚丨丿丶㇒㇏㇇乛㇓乀㇍㇂㇊丆二⺊卜十冂ユコ㇄㇅㇎㇌乜㇋厸丫䶹凵囗乁"
character_exclusions = "⿱丅丷一亅⿻㇀乚丨丿⿰�丶㇒㇏⿹乛㇓㇈⿸乀㇍⿺㇋㇂㇊丆⺊ユ⿶⿵⿴⿲コ凵⿳㇌⿷囗㇎㇅㇄厸䶹乛㇓㇈㇅㇄㇈一亅㇀ 乚丨丿丶㇒㇏㇇乛㇓乀㇍㇂㇊丆二⺊卜十冂ユコ㇄㇅㇎㇌乜㇋厸丫䶹凵囗乁"
get_characters_contained_pinyin_rows = (exclusions = []) ->
pinyin_index = get_character_pinyin_index()
compositions_index = get_full_compositions_index()
edges = []
has_parent = new Set()
for parent_char of compositions_index
continue unless parent_char.match hanzi_regexp
continue if exclusions.includes parent_char
continue unless pinyin_index[parent_char]
for child_char in compositions_index[parent_char] when child_char.match hanzi_regexp
continue unless pinyin_index[child_char]
edges.push [parent_char, child_char, pinyin_index[child_char]]
has_parent.add child_char
for parent_char of compositions_index when not has_parent.has parent_char
continue unless parent_char.match hanzi_regexp
continue if exclusions.includes parent_char
continue unless pinyin_index[parent_char]
edges.push [null, parent_char, pinyin_index[parent_char]]
edges
get_characters_contained_rows = (exclusions = character_exclusions) ->
compositions = get_compositions_index()
rows = []
for char of compositions when char.match(hanzi_regexp) and not exclusions.includes(char)
rows.push [char, compositions[char]]
rows.sort (a, b) -> a[1].length - b[1].length
update_characters_contained = ->
rows = get_characters_contained_pinyin_rows()
rows_gridlearner = get_characters_contained_pinyin_rows character_exclusions_gridlearner
for a in rows_gridlearner
continue unless a[2]
a[2] = a[2] + get_syllable_circle_arrow a[2]
write_csv_file "data/gridlearner/characters-by-component.csv", rows_gridlearner
rows = get_characters_contained_rows character_exclusions
lines = (a[0] + " " + a[1].join("") for a in rows).join "\n"
fs.writeFileSync "data/characters-contained.txt", lines
rows = (a[0] + " " + get_char_decompositions(a[0]).join("") for a in rows)
fs.writeFileSync "data/characters-containing.txt", rows.join "\n"
update_characters_data = ->
graphics_data = JSON.parse read_text_file "data/characters-svg-animcjk-simple.json"
character_data = read_csv_file "data/characters-strokes-decomposition.csv"
compositions_index = get_compositions_index()
dictionary_lookup = dictionary_index_word_f 0
character_frequency_index = get_character_frequency_index()
result = []
for a, i in character_data
[char, strokes, decomposition] = a
strokes = parseInt strokes, 10
svg_paths = graphics_data[char] || ""
compositions = compositions_index[char] || []
entries = dictionary_lookup char
if entries and entries.length
entry = entries[0]
pinyin = entry[1]
else pinyin = ""
result.push [char, strokes, pinyin, decomposition || "", compositions.join(""), svg_paths]
result = sort_by_character_frequency character_frequency_index, 0, result
fs.writeFileSync "data/characters-svg.json", JSON.stringify result
get_common_words_per_character = (max_words_per_char, max_frequency) ->
character_frequency_index = get_character_frequency_index()
get_character_example_words = get_character_example_words_f()
standard_chars = read_csv_file "data/table-of-general-standard-chinese-characters.csv"
chars = standard_chars.map (a) -> [a[0], a[1].split(", ")[0]]
chars = sort_by_character_frequency character_frequency_index, 0, chars
rows = for a in chars
a = get_character_example_words a[0], a[1], max_frequency
if 1 < a.length then a = a.slice 0, max_words_per_char
a
rows = rows.flat 1
rows = array_deduplicate_key rows, (a) -> a[1]
is_file = (path) -> fs.statSync(path).isFile()
strip_extensions = (filename) -> filename.replace /\.[^.]+$/, ''
update_lists = (paths) ->
nav_links = []
paths = (a for a in paths when is_file a)
content = for path, i in paths
rows = read_csv_file path
parts = for row in rows
[head, tail...] = row
tail = tail.join " "
"""
#{head}#{tail}
"""
label = strip_extensions node_path.basename path
nav_links.push """
#{label}
"""
"" + parts.join("\n") + "
"
content = content.join "\n"
nav_links = nav_links.join "\n"
font = read_text_file "src/NotoSansSC-Light.ttf.base64"
html = read_text_file "src/lists-template.html"
html = replace_placeholders html, {font, content, nav_links}
fs.writeFileSync "tmp/lists.html", html
iconv = require "iconv-lite"
update_character_frequency = ->
buf = fs.readFileSync "/tmp/SUBTLEX-CH-CHR"
text = iconv.decode buf, "gb2312"
lines = text.split "\n"
chars = []
for line in lines when line.trim() and not line.startsWith("Character") and not line.startsWith("Total")
parts = line.trim().split /\s+/
chr = parts[0]
if chr.length is 1
chars.push chr
fs.writeFileSync "data/characters-by-frequency.txt", chars.join ""
update_word_frequency = ->
buf = fs.readFileSync "/tmp/SUBTLEX-CH-WF"
text = iconv.decode buf, "gb2312"
lines = text.split "\n"
words = []
for line in lines when line.trim() and not line.startsWith("Word")
parts = line.trim().split /\s+/
word = parts[0]
continue unless word.match /[\u4e00-\u9fff]/ # skip PUA and non-CJK
words.push word
fs.writeFileSync "data/words-by-frequency.txt", words.join "\n"
update_word_frequency_pinyin = ->
words = array_from_newline_file "data/words-by-frequency.txt"
dict = dictionary_index_word_f 0
result = for word in words
entry = dict word
continue unless entry
pinyin = entry[0][1]
[word, pinyin]
write_csv_file "data/words-by-frequency-with-pinyin.csv", result
get_practice_words = (num_attempts, max_freq) ->
# get a list of the most frequent words where each character ideally appears
# only once and no word appears twice.
word_frequency_index = get_word_frequency_index()
characters = get_all_standard_characters()
rows = read_csv_file "data/words-by-frequency-with-pinyin.csv"
rows = rows.filter (a)->
chars = split_chars a[0]
chars.length > 1 && chars[0] != chars[1]
candidate_words = {}
for [w, p] in rows
    freq = word_frequency_index[w] ? max_freq + 1
continue if freq > max_freq
for ch in split_chars w
continue unless ch in characters
(candidate_words[ch] ?= []).push [w,p,freq]
characters = characters.filter (ch)-> candidate_words[ch]?
for ch in characters
candidate_words[ch].sort (a,b)-> a[2] - b[2]
best_total_cost = Infinity
best_assign = null
for attempt in [0...num_attempts]
order = array_shuffle characters.slice()
counts = {}
used_words = {}
assign = {}
run_cost = 0
for ch in order
opts = candidate_words[ch]
best_score = Infinity
chosen = null
for [w,p,freq] in opts when not used_words[w]
score = sum(counts[c] || 0 for c in w) + freq
if score < best_score or (score is best_score and Math.random() < 0.5)
best_score = score
chosen = [w,p,freq]
continue unless chosen?
assign[ch] = chosen
used_words[chosen[0]] = true
counts[c] = (counts[c] || 0) + 1 for c in chosen[0]
run_cost += best_score
if run_cost < best_total_cost
best_total_cost = run_cost
best_assign = assign
words = ([x[0],x[1]] for ch,x of best_assign)
sort_by_word_frequency word_frequency_index, 0, words
update_practice_words = ->
rows = get_practice_words 1000, Infinity
write_csv_file "data/practice-words.csv", rows
update_gridlearner_data = ->
chars = get_all_characters_with_pinyin()
batch_size = 300
get_batch_index = (i) -> (1 + i / batch_size).toString().padStart 2, "0"
for i in [0...chars.length] by batch_size
data = ([a[0], a[1]] for a in chars[i...i + batch_size])
ii = get_batch_index i
write_csv_file "data/gridlearner/characters-pinyin-#{ii}.dsv", data
update_characters_series = ->
rows = read_csv_file "data/gridlearner/characters-by-component.csv"
graph = {}
for [p,c] in rows
object_array_add graph, p, c
max_start_degree = 30
memo = {}
  longest = (n) ->
    # memoized: returns all longest containment chains starting at n
    return memo[n] if memo[n]?
    kids = graph[n] or []
    return memo[n] = [[n]] unless kids.length
    memo[n] = for k in kids
      best = longest(k).reduce (a, b) -> if b.length > a.length then b else a
      [n].concat best
nodes = delete_duplicates_stable (rows.map((r)->r[0]).concat rows.map((r)->r[1]))
chains = []
for n in nodes when (graph[n]?.length||0) and graph[n].length <= max_start_degree
chains = chains.concat longest n
seen = new Set()
uniq = []
for ch in chains when ch.length > 2
id = ch.join ""
continue if seen.has id
uniq.push ch
seen.add id
sub = (a,b)-> b.join("").includes a.join("")
uniq = uniq.filter (c)-> not uniq.some (d)-> d isnt c and d.length>c.length and sub c,d
uniq = uniq.sort (a,b)-> b.length - a.length
fs.writeFileSync "data/characters-series.txt", uniq.map((c)->c.join "").join "\n"
similar_initial = (s1, s2) ->
pairs =
c: "z", z: "c",
j: "q", q: "j",
k: "g", g: "k"
  # quick-and-dirty initial extractor, good enough for the pairs above
initial = (s) ->
if s.startsWith("zh") or s.startsWith("ch") or s.startsWith("sh")
s.slice 0, 2
else s[0]
i1 = initial s1
i2 = initial s2
r1 = s1.slice i1.length
r2 = s2.slice i2.length
(pairs[i1] is i2) and (r1 is r2)
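# similar_initial compares two toneless syllables and reports whether they differ
# only by one of the easily confused initial pairs above. Illustrative use:
#   similar_initial "gan", "kan"  # -> true  (g/k pair, same final)
#   similar_initial "ju", "qu"    # -> true  (j/q pair)
#   similar_initial "ba", "pa"    # -> false (b/p is not in the pairs table)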
update_characters_links = ->
pinyin_index = get_character_pinyin_index() # {char → "xx4"}
tone_index = get_character_tone_index() # {char → 4}
rows = read_csv_file "data/gridlearner/characters-by-component.csv"
by_component = {}
rows.forEach ([component, carrier]) ->
return unless component and carrier
object_array_add by_component, component, carrier
output_rows = []
for comp_char, carriers of by_component
base_py = pinyin_index[comp_char]
continue unless base_py # skip if the component itself lacks a reading
base_py = base_py.split(",")[0]
base_syl = base_py.replace /[0-5]$/, ""
base_tone = parseInt base_py.slice(-1), 10
tone_syll = []
tone_only = []
syl_only = []
init_links = []
carriers.forEach (c) ->
return if c is comp_char
cp = pinyin_index[c]
return unless cp
cp = cp.split(",")[0]
c_syl = cp.replace /[0-5]$/, ""
c_tone = parseInt cp.slice(-1), 10
if cp is base_py then tone_syll.push c
else if c_tone is base_tone then tone_only.push c
else if c_syl is base_syl then syl_only.push c
else if similar_initial base_syl, c_syl then init_links.push c
dedup = delete_duplicates_stable
[tone_syll, tone_only, syl_only, init_links] =
(dedup lst for lst in [tone_syll, tone_only, syl_only, init_links])
if tone_syll.length or tone_only.length or syl_only.length or init_links.length
output_rows.push [
comp_char,
tone_syll.join(""),
syl_only.join(""),
tone_only.join(""),
init_links.join("")
]
write_csv_file "data/character-links.csv", output_rows
run = ->
update_characters_series()
#update_characters_links()
#find_longest_containment_chains()
#collect_characters_by_syllable_containment()
module.exports = {
read_text_file
get_characters_by_pinyin_rows
clean_frequency_list
replace_placeholders
object_array_add
get_all_characters_with_pinyin
update_dictionary
update_characters_data
traditional_to_simplified
pinyin_to_hanzi
hanzi_to_pinyin
mark_to_number
update_characters_by_pinyin
update_characters_learning
update_pinyin_learning
grade_text
grade_text_files
run
update_lists
}