# main.coffee
# Imports assumed for this section: fs and path are Node built-ins and
# iconv-lite provides iconv.decode; the require paths for the
# project-local modules (h, lookup) are hypothetical.
fs = require "fs"
node_path = require "path"
iconv = require "iconv-lite"
pinyin_utils = require "pinyin-utils"
h = require "./helpers"
lookup = require "./lookup"

# Map each word to its rank in the frequency list. The existence check
# uses "?" so that rank 0 (the most frequent word) is not overwritten
# by a later duplicate.
get_word_frequency_index = () ->
  frequency = h.array_from_newline_file "data/words-by-frequency.txt"
  frequency_index = {}
  frequency.forEach (a, i) ->
    a = a.replace " ", ""
    frequency_index[a] = i unless frequency_index[a]?
  frequency_index

get_word_frequency_index_with_pinyin = () ->
  frequency = h.array_from_newline_file "data/words-by-frequency-with-pinyin.csv"
  frequency_index = {}
  frequency.forEach (a, i) ->
    a = a.replace " ", ""
    frequency_index[a] = i unless frequency_index[a]?
  frequency_index

get_all_standard_characters = () ->
  h.read_csv_file("data/table-of-general-standard-chinese-characters.csv").map (a) -> a[0]

get_all_characters = () ->
  h.read_csv_file("data/characters-strokes-decomposition.csv").map (a) -> a[0]

display_all_characters = () -> console.log get_all_characters().join("")

# Rank character+pinyin pairs by first occurrence in the frequency data.
get_character_pinyin_frequency_index = () ->
  result = {}
  index = 0
  chars = get_frequency_characters_and_pinyin()
  chars.forEach (a) ->
    key = a[0] + (a[1] || "")
    unless result[key]?
      result[key] = index
      index += 1
  result

# Count how often each character occurs with each reading. Counts start
# at 0, so only readings seen at least twice produce a row.
update_character_reading_count = () ->
  index = {}
  rows = []
  chars = get_all_characters()
  chars_and_pinyin = get_frequency_characters_and_pinyin()
  chars.forEach (a) ->
    chars_and_pinyin.forEach (b) ->
      if a[0] is b[0]
        key = a[0] + b[1]
        if index[key] != undefined then index[key] += 1 else index[key] = 0
  Object.keys(index).forEach (a) ->
    count = index[a]
    if count then rows.push [a[0], a.slice(1), count]
  rows = rows.sort (a, b) -> b[2] - a[2]
  h.write_csv_file "data/characters-pinyin-count.csv", rows

sort_by_stroke_count = (stroke_count_index, character_key, data) ->
  data.sort sort_by_index_and_character_f stroke_count_index, character_key

sort_by_word_frequency_with_pinyin = (frequency_index, word_key, pinyin_key, data) ->
  data.sort (a, b) ->
    fa = frequency_index[a[word_key] + a[pinyin_key]]
    fb = frequency_index[b[word_key] + b[pinyin_key]]
    if fa is undefined and fb is undefined
      a[word_key].length - b[word_key].length
    else if fa is undefined then 1
    else if fb is undefined then -1
    else fa - fb

class_for_tone = (tone) -> "tone#{tone}"

get_characters_by_pinyin_rows_flat = ->
  result = []
  for a in get_characters_by_pinyin_rows()
    for b in a[1]
      result.push [a[0], b]
  result

update_character_tables = ->
  tone_index = get_character_tone_index()
  pinyin_index = get_character_pinyin_index()
  [pinyin, pinyin_by_count] = build_pinyin_sets()
  prelearn = build_prelearn()
  contained = build_contained tone_index, pinyin_index
  tables =
    pinyin: pinyin
    contained: contained
    pinyin_by_count: pinyin_by_count
    prelearn: prelearn
  [content, nav_links] = update_character_tables_html tables
  font = h.read_text_file "src/NotoSansSC-Light.ttf.base64"
  html = h.read_text_file "src/character-tables-template.html"
  html = h.replace_placeholders html, {font, content, nav_links}
  for key, value of tables
    tables[key] = (b.reverse() for b in value)
  prelearn2 = []
  for a in prelearn
    for b in h.split_chars a[0]
      prelearn2.push [b, a[1]]
  h.write_csv_file "tmp/prelearn.csv", prelearn2

update_characters_by_pinyin_vertical = (rows) ->
  vertical_rows = format_lines_vertically rows
  fs.writeFileSync "data/characters-by-pinyin-by-count-vertical.csv", vertical_rows.join "\n"

sort_by_array_with_index = (a, sorting, index) ->
  a.sort (a, b) -> sorting.indexOf(a[index]) - sorting.indexOf(b[index])

get_character_syllables_tones_count_index = () ->
  result = {}
  h.read_csv_file("data/syllables-tones-character-counts.csv").forEach (a) ->
    result[a[0]] = parseInt a[1]
  result
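
# Usage sketch for the index helpers above (the rows are hypothetical,
# not taken from the real data files): build an index once and reuse it
# across sorts.
#
#   index = get_word_frequency_index_with_pinyin()
#   rows = [["学生", "xue2sheng5"], ["学", "xue2"]]
#   sort_by_word_frequency_with_pinyin index, 0, 1, rows
#   # rows is now ordered by frequency rank; words missing from the
#   # index sort last, with ties between them broken by word length.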
sort_standard_character_readings = () ->
  reading_count_index = get_character_reading_count_index()
  path = "data/table-of-general-standard-chinese-characters.csv"
  rows = h.read_csv_file(path).map (a) ->
    char = a[0]
    # Normalize readings to numbered pinyin; tone 5 marks the neutral tone.
    pinyin = a[1].split(", ").map (a) -> if a.match(/[0-5]$/) then a else a + "5"
    pinyin = pinyin.sort (a, b) ->
      (reading_count_index[char + b] || 0) - (reading_count_index[char + a] || 0)
    a[1] = pinyin.join ", "
    a
  h.write_csv_file path, rows

sort_by_frequency = (data, char_key) -> data.sort sort_by_frequency_f char_key

update_syllables_character_count = () ->
  chars = h.read_csv_file("data/characters-by-pinyin.csv").map (a) -> [a[0], a[1].length]
  chars_without_tones = chars.map (a) -> [a[0].replace(/[0-5]/g, ""), a[1]]
  get_data = (chars) ->
    counts = {}
    chars.forEach (a) ->
      if counts[a[0]] then counts[a[0]] += a[1] else counts[a[0]] = a[1]
    chars = chars.map (a) -> a[0]
    chars = h.delete_duplicates_stable chars
    chars.map((a) -> [a, counts[a]]).sort (a, b) -> b[1] - a[1]
  h.write_csv_file "data/syllables-tones-character-counts.csv", get_data(chars)
  h.write_csv_file "data/syllables-character-counts.csv", get_data(chars_without_tones)

# Build [parent, child, child_pinyin] edges from the decomposition data;
# components that never occur as a child get a row with a null parent.
get_characters_contained_pinyin_rows = (exclusions = []) ->
  pinyin_index = get_character_pinyin_index()
  compositions_index = get_full_compositions_index()
  edges = []
  has_parent = new Set()
  for parent_char of compositions_index
    continue unless parent_char.match h.hanzi_regexp
    continue if exclusions.includes parent_char
    continue unless pinyin_index[parent_char]
    for child_char in compositions_index[parent_char] when child_char.match h.hanzi_regexp
      continue unless pinyin_index[child_char]
      edges.push [parent_char, child_char, pinyin_index[child_char]]
      has_parent.add child_char
  for parent_char of compositions_index when not has_parent.has parent_char
    continue unless parent_char.match h.hanzi_regexp
    continue if exclusions.includes parent_char
    continue unless pinyin_index[parent_char]
    edges.push [null, parent_char, pinyin_index[parent_char]]
  edges

get_characters_contained_rows = (exclusions = character_exclusions) ->
  compositions = get_compositions_index()
  rows = []
  for char of compositions when char.match(h.hanzi_regexp) and not exclusions.includes(char)
    rows.push [char, compositions[char]]
  rows.sort (a, b) -> a[1].length - b[1].length

update_characters_contained = ->
  rows = get_characters_contained_pinyin_rows()
  rows_gridlearner = get_characters_contained_pinyin_rows character_exclusions_gridlearner
  h.write_csv_file "data/gridlearner/characters-by-component.csv", rows_gridlearner
  rows = get_characters_contained_rows character_exclusions
  lines = (a[0] + " " + a[1].join("") for a in rows).join "\n"
  fs.writeFileSync "data/characters-contained.txt", lines
  rows = (a[0] + " " + get_char_decompositions(a[0]).join("") for a in rows)
  fs.writeFileSync "data/characters-containing.txt", rows.join "\n"

get_common_words_per_character = (max_words_per_char, max_frequency) ->
  character_frequency_index = get_character_frequency_index()
  get_character_example_words = get_character_example_words_f()
  standard_chars = h.read_csv_file "data/table-of-general-standard-chinese-characters.csv"
  chars = standard_chars.map (a) -> [a[0], a[1].split(", ")[0]]
  chars = sort_by_character_frequency character_frequency_index, 0, chars
  rows = for a in chars
    a = get_character_example_words a[0], a[1], max_frequency
    if 1 < a.length then a = a.slice 0, max_words_per_char
    a
  rows = rows.flat 1
  rows = h.array_deduplicate_key rows, (a) -> a[1]
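
# Usage sketch for get_common_words_per_character (argument values are
# illustrative): collect up to three example words per standard
# character, considering only words within the top 10000 by frequency,
# deduplicated on the word field.
#
#   rows = get_common_words_per_character 3, 10000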
update_word_frequency = ->
  # The SUBTLEX-CH word list is GB2312-encoded; decode it and keep only
  # lines whose first field contains hanzi, skipping the header row.
  buf = fs.readFileSync "/tmp/SUBTLEX-CH-WF"
  text = iconv.decode buf, "gb2312"
  lines = text.split "\n"
  words = []
  for line in lines when line.trim() and not line.startsWith("Word")
    parts = line.trim().split /\s+/
    word = parts[0]
    continue unless word.match /[\u4e00-\u9fff]/
    words.push word
  fs.writeFileSync "data/subtlex-words-by-frequency.txt", words.join "\n"

# Pick one practice word per character with a randomized greedy search:
# run num_attempts passes over a shuffled character order, scoring each
# candidate word by its frequency rank plus how often its characters
# were already used in the pass, and keep the lowest-cost assignment.
get_practice_words = (num_attempts, max_freq) ->
  word_frequency_index = get_word_frequency_index()
  characters = get_all_standard_characters()
  rows = h.read_csv_file "data/words-by-frequency-with-pinyin.csv"
  rows = rows.filter (a) ->
    chars = h.split_chars a[0]
    chars.length > 1 && chars[0] != chars[1]
  candidate_words = {}
  for [w, p] in rows
    freq = word_frequency_index[w] || max_freq + 1
    continue if freq > max_freq
    for ch in h.split_chars w
      continue unless ch in characters
      (candidate_words[ch] ?= []).push [w, p, freq]
  characters = characters.filter (ch) -> candidate_words[ch]?
  for ch in characters
    candidate_words[ch].sort (a, b) -> a[2] - b[2]
  best_total_cost = Infinity
  best_assign = null
  for attempt in [0...num_attempts]
    order = h.array_shuffle characters.slice()
    counts = {}
    used_words = {}
    assign = {}
    run_cost = 0
    for ch in order
      opts = candidate_words[ch]
      best_score = Infinity
      chosen = null
      for [w, p, freq] in opts when not used_words[w]
        score = h.sum(counts[c] || 0 for c in w) + freq
        if score < best_score or (score is best_score and Math.random() < 0.5)
          best_score = score
          chosen = [w, p, freq]
      continue unless chosen?
      assign[ch] = chosen
      used_words[chosen[0]] = true
      counts[c] = (counts[c] || 0) + 1 for c in chosen[0]
      run_cost += best_score
    if run_cost < best_total_cost
      best_total_cost = run_cost
      best_assign = assign
  words = ([x[0], x[1]] for ch, x of best_assign)
  sort_by_word_frequency word_frequency_index, 0, words

update_practice_words = ->
  rows = get_practice_words 1000, Infinity
  h.write_csv_file "data/practice-words.csv", rows

update_gridlearner_characters_by_pinyin = ->
  chars = get_all_characters_with_pinyin()
  batch_size = 300
  get_batch_index = (i) -> (1 + i / batch_size).toString().padStart 2, "0"
  for i in [0...chars.length] by batch_size
    data = ([a[0], a[1]] for a in chars[i...i + batch_size])
    ii = get_batch_index i
    h.write_csv_file "data/gridlearner/characters-pinyin-#{ii}.dsv", data

similar_initial = (s1, s2) ->
  pairs = c: "z", z: "c", j: "q", q: "j", k: "g", g: "k"
  initial = (s) ->
    if s.startsWith("zh") or s.startsWith("ch") or s.startsWith("sh") then s.slice 0, 2
    else s[0]
  i1 = initial s1
  i2 = initial s2
  r1 = s1.slice i1.length
  r2 = s2.slice i2.length
  (pairs[i1] is i2) and (r1 is r2)
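
# Examples for similar_initial; it links toneless syllables that differ
# only in a commonly confused initial pair and share the same final:
#
#   similar_initial "zao", "cao"    # true: z <-> c, final "ao" matches
#   similar_initial "jian", "qian"  # true: j <-> q
#   similar_initial "zao", "zhao"   # false: "zh" is not in the pair table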
update_characters_links = ->
  pinyin_index = get_character_pinyin_index()
  tone_index = get_character_tone_index()
  rows = h.read_csv_file "data/gridlearner/characters-by-component.csv"
  by_component = {}
  rows.forEach ([component, carrier]) ->
    return unless component and carrier
    h.object_array_add by_component, component, carrier
  output_rows = []
  for comp_char, carriers of by_component
    base_py = pinyin_index[comp_char]
    continue unless base_py
    base_py = base_py.split(",")[0]
    base_syl = base_py.replace /[0-5]$/, ""
    base_tone = parseInt base_py.slice(-1), 10
    tone_syll = []
    tone_only = []
    syl_only = []
    init_links = []
    # Group carriers by how their reading relates to the component's:
    # same syllable and tone, same tone only, same syllable only, or a
    # similar initial.
    carriers.forEach (c) ->
      return if c is comp_char
      cp = pinyin_index[c]
      return unless cp
      cp = cp.split(",")[0]
      c_syl = cp.replace /[0-5]$/, ""
      c_tone = parseInt cp.slice(-1), 10
      if cp is base_py then tone_syll.push c
      else if c_tone is base_tone then tone_only.push c
      else if c_syl is base_syl then syl_only.push c
      else if similar_initial base_syl, c_syl then init_links.push c
    dedup = h.delete_duplicates_stable
    [tone_syll, tone_only, syl_only, init_links] =
      (dedup lst for lst in [tone_syll, tone_only, syl_only, init_links])
    if tone_syll.length or tone_only.length or syl_only.length or init_links.length
      output_rows.push [
        comp_char
        tone_syll.join("")
        syl_only.join("")
        tone_only.join("")
        init_links.join("")
      ]
  h.write_csv_file "data/character-links.csv", output_rows

# Reads CSV from stdin and writes to stdout: the 0 and 1 arguments are
# file descriptors, assuming the h.*_csv_file helpers pass them through
# to fs.readFileSync/fs.writeFileSync.
dsv_characters_add_pinyin = (character_index) ->
  pinyin_index = get_character_pinyin_index()
  rows = h.read_csv_file(0).map (a) ->
    pinyin = pinyin_index[a[character_index]]
    return a unless pinyin
    a.concat [pinyin]
  h.write_csv_file 1, rows

# Write the longest component-to-character chains: treat the component
# data as a graph, take the longest path from each low-degree start
# node, and drop chains that are substrings of longer ones.
update_characters_series = ->
  rows = h.read_csv_file "data/gridlearner/characters-by-component.csv"
  graph = {}
  for [p, c] in rows
    h.object_array_add graph, p, c
  max_start_degree = 30
  memo = {}
  longest = (n) ->
    return memo[n] if memo[n]?
    kids = graph[n] or []
    return memo[n] = [[n]] unless kids.length
    # Memoize the list of chains (one per child), not just the last one.
    memo[n] = for k in kids
      [n].concat longest(k).reduce (a, b) -> if b.length > a.length then b else a
  nodes = h.delete_duplicates_stable (rows.map((r) -> r[0]).concat rows.map((r) -> r[1]))
  chains = []
  for n in nodes when (graph[n]?.length || 0) and graph[n].length <= max_start_degree
    chains = chains.concat longest n
  seen = new Set()
  uniq = []
  for ch in chains when ch.length > 2
    id = ch.join ""
    continue if seen.has id
    uniq.push ch
    seen.add id
  sub = (a, b) -> b.join("").includes a.join("")
  uniq = uniq.filter (c) -> not uniq.some (d) -> d isnt c and d.length > c.length and sub c, d
  uniq = uniq.sort (a, b) -> b.length - a.length
  fs.writeFileSync "data/characters-series.txt", uniq.map((c) -> c.join "").join "\n"

pinyin_to_hanzi = (text) ->
  cleaned = text.replace(h.non_pinyin_regexp, " ").trim()
  find_multiple_word_matches cleaned, 1, 0, h.pinyin_split2

hanzi_to_pinyin = (text) ->
  cleaned = text.replace(h.non_hanzi_regexp, " ").trim()
  find_multiple_word_matches cleaned, 0, 1, h.split_chars

# Greedy longest-match dictionary segmentation: for each start position
# build all spans up to max_word_length syllables, try them longest
# first, and fall back to the single syllable when nothing matches.
find_multiple_word_matches = (text, lookup_index, translation_index, split_syllables_f) ->
  dictionary_lookup = dictionary_index_word_f lookup_index
  result_tokens = []
  text.split(" ").forEach (segment) ->
    syllables = split_syllables_f segment
    max_word_length = 5
    slice_join = (start_index, end_index) -> syllables.slice(start_index, end_index).join("")
    build_spans_from = (start_index) ->
      end_limit = Math.min(start_index + max_word_length, syllables.length) + 1
      slice_join start_index, end_index for end_index in [(start_index + 1)...end_limit]
    candidate_spans_per_start = (build_spans_from start_index for start_index in [0...syllables.length])
    candidate_index = 0
    while candidate_index < candidate_spans_per_start.length
      matched_translations = []
      reversed_spans = candidate_spans_per_start[candidate_index].toReversed()
      reversed_index = 0
      while reversed_index < reversed_spans.length
        translations = dictionary_lookup reversed_spans[reversed_index]
        if translations
          matched_translations.push translations.map((row) -> row[translation_index]).join "/"
          break
        reversed_index += 1
      if matched_translations.length
        result_tokens.push matched_translations[0]
        candidate_index += reversed_spans.length - reversed_index
      else
        result_tokens.push candidate_spans_per_start[candidate_index][0]
        candidate_index += 1
  result_tokens.join " "

mark_to_number = (text) ->
  text.split(" ").map((token) -> h.pinyin_split2(token).map(pinyin_utils.markToNumber).join("")).join(" ")
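
# Example for mark_to_number (hedged: the exact output depends on
# h.pinyin_split2 and pinyin-utils' markToNumber): tone marks become
# tone numbers per syllable while word spacing is preserved.
#
#   mark_to_number "nǐ hǎo"  # -> "ni3 hao3"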
h.read_csv_file("data/table-of-general-standard-chinese-characters.csv") chars = standard_chars.map (a) -> [a[0], a[1].split(", ")[0]] chars = sort_by_character_frequency character_frequency_index, 0, chars rows = for a in chars a = get_character_example_words(a[0], a[1]) if 1 < a.length then a = a.slice 1, options.words_per_char + 1 [b[1], b[0], b[2]] for b in a rows = rows.flat 1 rows = h.array_deduplicate_key rows,(a) -> a[1] add_word_choices = (rows) -> rows.map (a) -> tries = 30 alternatives = [a[1]] while tries && alternatives.length < options.word_choices alternative = h.random_element rows if a[1].length == alternative[1].length && a[0] != alternative[0] && !alternatives.includes(alternative[1]) alternatives.push alternative[1] tries -= 1 a.push h.array_shuffle(alternatives).join(" ") a rows = add_sort_field add_word_choices rows h.write_csv_file "data/pinyin-learning.csv", rows grade_text = (a) -> chars = h.delete_duplicates a.match h.hanzi_regexp frequency_index = lookup.char_freq_index() all_chars_count = Object.keys(frequency_index).length frequencies = chars.map((a) -> frequency_index[a] || all_chars_count).sort((a, b) -> a - b) count_score = chars.length / all_chars_count rarity_score = h.median(frequencies.splice(-10)) / all_chars_count Math.max 1, Math.round(10 * (count_score + rarity_score)) grade_text_files = (paths) -> paths.forEach (a) -> console.log grade_text(h.read_text_file(a)) + " " + node_path.basename(a) clean_frequency_list = () -> frequency_array = h.array_from_newline_file "data/words-by-frequency.txt" frequency_array = frequency_array.filter (a) -> h.traditional_to_simplified h.remove_non_chinese_characters a frequency_array.forEach (a) -> console.log a