# Strokes, ideographic description characters and rare components excluded
# when building the generated character lists.
character_exclusions_gridlearner = "灬罒彳𠂉⺈辶卝埶冃丏卝宀冖亠䒑丅丷一亅⿻㇀乚丨丿⿰�丶㇒㇏⿹乛㇓㇈⿸乀㇍⿺㇋㇂㇊丆⺊ユ⿶⿵⿴⿲コ凵⿳㇌⿷囗㇎㇅㇄厸䶹乛㇓㇈㇅㇄㇈一亅㇀ 乚丨丿丶㇒㇏㇇乛㇓乀㇍㇂㇊丆二⺊卜十冂ユコ㇄㇅㇎㇌乜㇋厸丫䶹凵囗乁"
character_exclusions = "⿱丅丷一亅⿻㇀乚丨丿⿰�丶㇒㇏⿹乛㇓㇈⿸乀㇍⿺㇋㇂㇊丆⺊ユ⿶⿵⿴⿲コ凵⿳㇌⿷囗㇎㇅㇄厸䶹乛㇓㇈㇅㇄㇈一亅㇀ 乚丨丿丶㇒㇏㇇乛㇓乀㇍㇂㇊丆二⺊卜十冂ユコ㇄㇅㇎㇌乜㇋厸丫䶹凵囗乁"

h = require "./helper"
fs = require "fs"
node_path = require "path"
iconv = require "iconv-lite"
coffee = require "coffeescript"
lookup = require "./lookup"

# Collect every character from the standard table, the additional characters and
# the decomposition data, pair each with its primary pinyin, and write the result
# ordered by frequency and dependency.
update_all_characters_with_pinyin = ->
  primary_pinyin_f = lookup.make_primary_pinyin_f()
  order_f = lookup.make_char_freq_dep_index_from_file_f()
  if Object.keys(order_f.index_map).length is 0 then order_f = lookup.make_char_freq_dep_index_f()
  character_set = new Set()
  h.read_csv_file("data/table-of-general-standard-chinese-characters.csv").forEach (row) -> character_set.add row[0]
  h.read_csv_file("data/additional-characters.csv").forEach (row) ->
    unless character_exclusions.includes row[0] then character_set.add row[0]
  h.read_csv_file("data/characters-strokes-decomposition.csv").forEach (row) ->
    character_set.add row[0]
    return unless row.length >= 3
    h.split_chars(row[2]).forEach (c) -> if c.match h.hanzi_regexp then character_set.add c
  pairs = Array.from(character_set).map (c) -> [c, primary_pinyin_f c]
  pairs.sort (a, b) -> (order_f.index_map[a[0]] ? 9e15) - (order_f.index_map[b[0]] ? 9e15)
  h.write_csv_file "data/characters-pinyin-by-frequency-dependency.csv", pairs

get_all_characters_with_pinyin = -> h.read_csv_file("data/characters-pinyin-by-frequency-dependency.csv")

# Group characters by their primary pinyin, skipping neutral-tone readings.
get_characters_by_pinyin_rows = ->
  primary_pinyin_f = lookup.make_primary_pinyin_f()
  groups = {}
  get_all_characters().forEach (c) ->
    p = primary_pinyin_f c
    return unless p? and not p.endsWith "5"
    (groups[p] ?= []).push c
  rows = Object.keys(groups).map (p) -> [p, groups[p]]
  rows.sort (a, b) -> a[0].localeCompare(b[0]) || b[1].length - a[1].length

get_char_decompositions = (c) ->
  f = lookup.make_char_decompositions_f lookup.make_primary_pinyin_f()
  f c

# Sort rows in place by the frequency/dependency index of the character in column char_key.
sort_by_frequency_and_dependency = (rows, char_key) ->
  order_f = lookup.make_char_freq_dep_index_from_file_f()
  if Object.keys(order_f.index_map).length is 0 then order_f = lookup.make_char_freq_dep_index_f()
  rows.sort (a, b) -> (order_f.index_map[a[char_key]] ? 9e15) - (order_f.index_map[b[char_key]] ? 9e15)

# Split the translation field on "/" and append a tone-number-free copy of the pinyin.
dictionary_cedict_to_json = (rows) ->
  JSON.stringify rows.map (r) ->
    r[2] = r[2].split "/"
    r.push r[1].replace /[0-4]/g, ""
    r

# Extract the character column from the SUBTLEX-CH character frequency list.
update_character_frequency = ->
  buf = fs.readFileSync "/tmp/subtlex-ch-chr"
  text = iconv.decode buf, "gb2312"
  lines = text.split "\n"
  out = []
  for line in lines when line.trim() and not line.startsWith("Character") and not line.startsWith("Total")
    parts = line.trim().split /\s+/
    c = parts[0]
    if c.length is 1 then out.push c
  fs.writeFileSync "data/subtlex-characters-by-frequency.txt", out.join "\n"

update_characters_by_frequency_dependency = ->
  order_f = lookup.make_char_freq_dep_index_f()
  ordered = Object.keys(order_f.index_map).sort (a, b) -> order_f.index_map[a] - order_f.index_map[b]
  rows = ordered.map (c) -> [c]
  h.write_csv_file "data/characters-by-frequency-dependency.csv", rows

# Append cedict-only words after the SUBTLEX word list, ordered by the summed
# character frequency ranks, and write word/pinyin/translation files.
update_word_frequency_pinyin = ->
  char_freq_f = lookup.make_char_freq_index_f()
  dict_f = lookup.make_dictionary_index_word_f 0
  words = h.array_from_newline_file "data/subtlex-words-by-frequency.txt"
  add = (r[0] for r in h.read_csv_file "data/cedict.csv")
  seen = new Set words
  add = (w for w in add when not seen.has w)
  cap = Object.keys(char_freq_f.index_map).length
  add.sort (a, b) ->
    sa = 0
    sb = 0
    sa += char_freq_f.index_map[c] or cap for c in h.split_chars a
    sb += char_freq_f.index_map[c] or cap for c in h.split_chars b
    sa - sb
  words = words.concat add
  result = for w in words
    e = dict_f w
    continue unless e
    [w, e[0][1], e[0][2]]
  h.write_csv_file "data/words-by-frequency-with-pinyin-translation.csv", result
  h.write_csv_file "data/words-by-frequency-with-pinyin.csv", ([r[0], r[1]] for r in result)

update_characters_by_pinyin = ->
  rows = get_characters_by_pinyin_rows()
  joined = rows.map (r) -> [r[0], r[1].join ""]
  a = joined.sort (x, y) -> x[0].localeCompare(y[0]) || y[1].length - x[1].length
  h.write_csv_file "data/characters-by-pinyin.csv", a
  b = joined.slice().sort (x, y) -> y[1].length - x[1].length || x[0].localeCompare(y[0])
  h.write_csv_file "data/characters-by-pinyin-by-count.csv", b
  # Characters whose reading is shared by fewer than three characters.
  rare = []
  rows.forEach (r) -> if r[1].length < 3 then r[1].forEach (c) -> rare.push [c, r[0]]
  rare = rare.sort (x, y) -> x[1].localeCompare(y[1]) || x[0].localeCompare(y[0])
  h.write_csv_file "data/characters-pinyin-rare.csv", rare

# Combine stroke count, pinyin, decomposition, components and svg graphics per character.
update_characters_data = ->
  primary_pinyin_f = lookup.make_primary_pinyin_f()
  char_freq_f = lookup.make_char_freq_index_f()
  contained_by_f = lookup.make_contained_by_map_f()
  graphics = JSON.parse h.read_text_file "data/characters-svg-animcjk-simple.json"
  rows = h.read_csv_file "data/characters-strokes-decomposition.csv"
  contain_sorted = {}
  Object.keys(contained_by_f.index_map).forEach (k) ->
    v = contained_by_f.index_map[k]
    contain_sorted[k] = v.slice().sort (a, b) ->
      (char_freq_f.index_map[a] ? 9e15) - (char_freq_f.index_map[b] ? 9e15) or a.localeCompare b
  out = []
  for r in rows
    c = r[0]
    s = parseInt r[1], 10
    d = r[2] or ""
    svg = graphics[c] or ""
    comps = contain_sorted[c] or []
    p = primary_pinyin_f c
    out.push [c, s, p, d, comps.join(""), svg]
  out = out.sort (x, y) -> (char_freq_f.index_map[x[0]] ? 9e15) - (char_freq_f.index_map[y[0]] ? 9e15)
  fs.writeFileSync "data/characters-svg.json", JSON.stringify out

# Append learning columns to [character, pinyin] rows: contained components,
# containing characters, characters with the same reading, a sort index,
# example words and a reading-rarity label.
characters_add_learning_data = (rows, allowed_chars = null) ->
  char_by_reading_f = lookup.make_char_by_reading_index_f()
  primary_pinyin_f = lookup.make_primary_pinyin_f()
  char_decompositions_f = lookup.make_char_decompositions_f primary_pinyin_f
  contained_by_f = lookup.make_contained_by_map_f()
  rows = h.array_deduplicate_key rows, (r) -> r[0]
  max_same = 16
  max_containing = 5
  in_scope = (c) -> (not allowed_chars?) or allowed_chars.has c
  add_same_reading = (rows) ->
    rows.map (r) ->
      cs = (char_by_reading_f.index_map[r[1]] or []).filter(in_scope).slice 0, max_same
      cs = cs.filter (c) -> c isnt r[0]
      r.push cs.join ""
      r
  add_contained = (rows) ->
    rows.map (r) ->
      comps = (char_decompositions_f r[0]).map (x) -> x[0]
      comps = comps.filter in_scope
      formatted = comps.map((c) -> p = primary_pinyin_f c; if p then "#{c} #{p}" else null).filter Boolean
      r.push formatted.join ", "
      r
  add_containing = (rows) ->
    rows.map (r) ->
      carriers = (contained_by_f r[0]).filter(in_scope).slice 0, max_containing
      formatted = carriers.map((c) -> p = primary_pinyin_f c; if p then "#{c} #{p}" else null).filter Boolean
      r.push formatted.join ", "
      r
  add_examples = (rows) ->
    top_examples_f = lookup.make_top_examples_f()
    rows.map (r) ->
      words = top_examples_f r[0], 4
      r.push words.map((w) -> w.join " ").join "\n"
      r
  add_reading_classification = (rows) ->
    rows.map (r) ->
      cs = (char_by_reading_f.index_map[r[1]] or []).filter in_scope
      label = if cs.length is 1 then "unique" else if cs.length <= 3 then "rare" else ""
      r.push label
      r
  add_sort_field = (rows) ->
    a.push i for a, i in rows
    rows
  rows = add_contained rows
  rows = add_containing rows
  rows = add_same_reading rows
  rows = add_sort_field rows
  rows = add_examples rows
  rows = add_reading_classification rows
  rows

# Split the ordered character list into a first and an extended half and write
# full and reduced learning csv files for each.
update_characters_learning = ->
  all_rows = get_all_characters_with_pinyin()
  all_rows = sort_by_frequency_and_dependency all_rows, 0
  mid = Math.ceil all_rows.length / 2
  first = all_rows.slice 0, mid
  second = all_rows.slice mid
  first_set = new Set first.map (r) -> r[0]
  first_out = characters_add_learning_data first, first_set
  second_out = characters_add_learning_data second
  write_set = (rows, suffix) ->
    base = "data/characters-learning"
    h.write_csv_file "#{base}#{suffix}.csv", rows
    reduced = ([i + 1, r[0], r[1], r[5], r[3]] for r, i in rows)
    h.write_csv_file "#{base}-reduced#{suffix}.csv", reduced
  write_set first_out, ""
  write_set second_out, "-extended"

update_lists = (paths) ->
  nav_links = []
  paths = (p for p in paths when h.is_file p)
  content = for path, i in paths
    rows = h.read_csv_file path
    parts = for r in rows
      [head, tail...] = r
      tail = tail.join " "
      "#{head}#{tail}"
    label = h.strip_extensions node_path.basename path
    nav_links.push "#{label}"
    "