csv_parse = require "csv-parse/sync"
csv_stringify = require "csv-stringify/sync"
fs = require "fs"
hanzi_tools = require "hanzi-tools"
html_parser = require "node-html-parser"
http = require "https"
path = require "path"
pinyin_split = require "pinyin-split"
pinyin_utils = require "pinyin-utils"
#scraper = require "table-scraper"

read_csv_file = (path, delimiter) ->
  csv_parse.parse fs.readFileSync(path, "utf-8"), {delimiter: delimiter || " ", relax_column_count: true}

array_from_newline_file = (path) -> fs.readFileSync(path).toString().trim().split("\n")
on_error = (a) -> if a then console.error a
delete_duplicates = (a) -> [...new Set(a)]
split_chars = (a) -> [...a]
random_integer = (min, max) -> Math.floor(Math.random() * (max - min + 1)) + min
random_element = (a) -> a[random_integer 0, a.length - 1]
n_times = (n, f) -> [...Array(n).keys()].map f
remove_non_chinese_characters = (a) -> a.replace /[^\p{Script=Han}]/ug, ""
traditional_to_simplified = (a) -> hanzi_tools.simplify a
pinyin_split2 = (a) -> a.replace(/[0-5]/g, (a) -> a + " ").trim().split " "
median = (a) -> a.slice().sort((a, b) -> a - b)[Math.floor(a.length / 2)]
sum = (a) -> a.reduce ((a, b) -> a + b), 0
mean = (a) -> sum(a) / a.length
object_array_add = (object, key, value) -> if object[key] then object[key].push value else object[key] = [value]
array_intersection = (a, b) -> a.filter (a) -> b.includes(a)

write_csv_file = (path, data) ->
  csv = csv_stringify.stringify(data, {delimiter: " "}, on_error).trim()
  fs.writeFile path, csv, on_error

delete_duplicates_stable = (a) ->
  result = []
  existing = {}
  a.forEach (a) ->
    unless existing[a]
      existing[a] = true
      result.push a
  result

delete_duplicates_stable_with_key = (a, key) ->
  result = []
  existing = {}
  a.forEach (a) ->
    unless existing[a[key]]
      existing[a[key]] = true
      result.push a
  result

array_shuffle = (a) ->
  n = a.length
  while n > 0
    i = Math.floor Math.random() * n
    n -= 1
    [a[n], a[i]] = [a[i], a[n]]
  a

array_deduplicate_key = (a, get_key) ->
  existing = {}
  a.filter (a) ->
    key = get_key a
    if existing[key] then false
    else
      existing[key] = true
      true

# https://en.wiktionary.org/wiki/Appendix:Unicode
hanzi_unicode_ranges = [
  ["30A0", "30FF"] # katakana used for some components
  ["2E80", "2EFF"]
  ["31C0", "31EF"]
  ["4E00", "9FFF"]
  ["3400", "4DBF"]
  ["20000", "2A6DF"]
  ["2A700", "2B73F"]
  ["2B740", "2B81F"]
  ["2B820", "2CEAF"]
  ["2CEB0", "2EBEF"]
  ["30000", "3134F"]
  ["31350", "323AF"]
  ["2EBF0", "2EE5F"]
]

unicode_ranges_pattern = (a, is_reject) -> "[" + (if is_reject then "^" else "") + a.map((a) -> a.map((b) -> "\\u{#{b}}").join("-")).join("") + "]"
unicode_ranges_regexp = (a, is_reject) -> new RegExp unicode_ranges_pattern(a, is_reject), "gu"
hanzi_regexp = unicode_ranges_regexp hanzi_unicode_ranges
non_hanzi_regexp = unicode_ranges_regexp hanzi_unicode_ranges, true
hanzi_and_idc_regexp = unicode_ranges_regexp hanzi_unicode_ranges.concat([["2FF0", "2FFF"]])
non_pinyin_regexp = /[^a-z0-5]/g

cedict_glossary = (a) ->
  filter_regexp = [
    /^abbr\. for /
    /^also pr\. /
    /.ancient/
    /ancient./
    /[^()a-z0-9?':; ,.-]/
    /.bird species./
    /\(budd.+/
    /.buddhism/
    /buddhism./
    /.buddhist/
    /buddhist./
    /^cl:/
    /\(classical/
    /\(\d+/
    /\d+-\d+/
    /\(in classical/
    /.japan/
    /japan./
    /japanese/
    /.korea/
    /korea./
    /\(old\)/
    /\(onom/
    /.sanskrit/
    /sanskrit./
    /^see also [^a-zA-Z]/
    /^see [^a-zA-Z]/
    /^surname /
    /.taiwan/
    /taiwan./
    /^taiwanese \. /
    /^taiwan pr./
    /\(tw\)/
    /^\(used in /
    /^used in /
    /variant of /
    /\(loanword/
    /\(neologism/
  ]
  a = a.split("/").map (a) -> a.toLowerCase().split(";")
  a = a.flat().map (a) -> a.trim()
  a.filter (a) -> !filter_regexp.some((b) -> a.match b)

cedict_merge_definitions = (a) ->
  table = {}
  a.forEach (a, index) ->
    key = a[0] + "#" + a[1]
    if table[key]
      table[key][1][2] = table[key][1][2].concat a[2]
    else
      table[key] = [index, a]
  Object.values(table).sort((a, b) -> a[0] - b[0]).map((a) -> a[1])

cedict_additions = (a) ->
  # manual additions to the dictionary
  a.push ["你", "ni3", ["you"]]
  a

cedict_filter_only = () ->
  # retains the original cedict format.
  cedict = fs.readFileSync "data/cedict_ts.u8", "utf-8"
  frequency_array = array_from_newline_file "data/frequency.csv", "utf-8"
  frequency = {}
  frequency_array.forEach (a, i) -> frequency[a] = i
  rows = cedict.split "\n"
  data = rows.map (line) ->
    if "#" is line[0] then return null
    line = line.trim()
    parsed = line.match(/^([^ ]+) ([^ ]+) \[([^\]]+)\] \/(.*)\//)
    word_traditional = parsed[1]
    word = parsed[2]
    if word.match(/[a-zA-Z0-9]/) then return null
    pinyin = parsed[3]
    pinyin = pinyin.split(" ").map (a) -> pinyin_utils.markToNumber(a).replace("u:", "ü").replace("35", "3").replace("45", "4").replace("25", "2")
    pinyin = pinyin.join("").toLowerCase()
    glossary = cedict_glossary(parsed[4]).join("/")
    line = [word_traditional, word, "[#{pinyin}]", "/#{glossary}/"].join(" ")
    word_frequency = frequency[word] || (word.length + frequency_array.length)
    [word_frequency, line, word, word_traditional] if glossary.length
  data = data.filter (a) -> a
  data = data.sort (a, b) -> a[0] - b[0]
  cedict_filtered_lines = data.map (a) -> a[1]
  cedict_filtered = cedict_filtered_lines.join "\n"
  fs.writeFile "data/cedict-filtered.u8", cedict_filtered, on_error
  index_lines = []
  index_lines_traditional = []
  character_offset = 0
  data.forEach (a) ->
    word = a[2]
    word_traditional = a[3]
    character_offset = cedict_filtered.indexOf("#{word_traditional} #{word} ", character_offset)
    index_lines.push "#{word},#{character_offset}"
    index_lines_traditional.push "#{word_traditional},#{character_offset}"
  index_lines = index_lines.concat index_lines_traditional
  fs.writeFile "data/cedict-filtered.idx", index_lines.join("\n"), on_error

get_frequency_index = () ->
  # -> {"#{word}#{pinyin}": integer}
  frequency = array_from_newline_file "data/frequency-pinyin.csv", "utf-8"
  frequency_index = {}
  frequency.forEach (a, i) ->
    a = a.replace " ", ""
    frequency_index[a] = i unless frequency_index[a]
  frequency_index

get_all_characters = () -> read_csv_file("data/character-strokes-composition.csv").map (a) -> a[0]
display_all_characters = () -> console.log get_all_characters().join("")

get_all_characters_and_pinyin = () ->
  # sorted by frequency
  result = []
  a = read_csv_file "data/frequency-pinyin.csv"
  a.forEach (a) ->
    chars = split_chars a[0]
    pinyin = pinyin_split2 a[1]
    chars.forEach (a, i) -> result.push [a + pinyin[i], a, pinyin[i]]
  a = read_csv_file "data/table-of-general-standard-chinese-characters.csv"
  a.forEach (a) ->
    a[1].split(", ").forEach (pinyin) -> result.push [a[0] + pinyin, a[0], pinyin]
  delete_duplicates_stable_with_key(result, 0).map (a) -> [a[1], a[2].replace("u:", "ü")]

get_frequency_characters_and_pinyin = () ->
  # with duplicates.
  # use case: count character reading frequency
  result = []
  a = read_csv_file "data/frequency-pinyin.csv"
  a.forEach (a) ->
    chars = split_chars a[0]
    pinyin = pinyin_split2 a[1]
    chars.forEach (a, i) -> result.push [a, pinyin[i]]
  result

get_all_characters_sorted_by_frequency = () -> delete_duplicates_stable get_all_characters_and_pinyin().map (a) -> split_chars(a[0])[0]

get_character_frequency_index = () ->
  # -> {character: integer}
  chars = get_all_characters_sorted_by_frequency()
  frequency_index = {}
  chars.forEach (a, i) -> frequency_index[a] = i
  frequency_index

get_character_pinyin_frequency_index = () ->
  # -> {character + pinyin: integer}
  chars = get_frequency_characters_and_pinyin()
  result = {}
  index = 0
  chars.forEach (a) ->
    key = a[0] + (a[1] || "")
    unless result[key]
      result[key] = index
      index += 1
  result

update_character_reading_count = () ->
  # counts how common different readings are for characters
  index = {}
  rows = []
  chars = get_all_characters()
  chars_and_pinyin = get_frequency_characters_and_pinyin()
  chars.forEach (a) ->
    chars_and_pinyin.forEach (b) ->
      if a[0] is b[0]
        key = a[0] + b[1]
        if index[key] != undefined then index[key] += 1 else index[key] = 0
  Object.keys(index).forEach (a) ->
    count = index[a]
    if count then rows.push [a[0], a.slice(1), count]
  rows = rows.sort (a, b) -> b[2] - a[2]
  write_csv_file "data/character-reading-count.csv", rows

sort_by_frequency = (frequency_index, word_key, pinyin_key, data) ->
  data = data.sort (a, b) ->
    fa = frequency_index[a[word_key] + a[pinyin_key]]
    fb = frequency_index[b[word_key] + b[pinyin_key]]
    if fa is undefined and fb is undefined
      a[word_key].length - b[word_key].length
    else if fa is undefined
      1
    else if fb is undefined
      -1
    else
      fa - fb

sort_by_character_frequency = (frequency_index, character_key, data) ->
  data = data.sort (a, b) ->
    fa = frequency_index[a[character_key]]
    fb = frequency_index[b[character_key]]
    if fa is undefined and fb is undefined
      a[character_key].length - b[character_key].length
    else if fa is undefined
      1
    else if fb is undefined
      -1
    else
      fa - fb

update_cedict_csv = () ->
  cedict = fs.readFileSync "data/cedict_ts.u8", "utf-8"
  frequency_index = get_frequency_index()
  lines = cedict.split "\n"
  data = lines.map (line) ->
    if "#" is line[0] then return null
    line = line.trim()
    parsed = line.match(/^([^ ]+) ([^ ]+) \[([^\]]+)\] \/(.*)\//)
    word = parsed[2]
    if word.match(/[a-zA-Z0-9]/) then return null
    pinyin = parsed[3]
    pinyin = pinyin.split(" ").map (a) -> pinyin_utils.markToNumber(a).replace("u:", "ü").replace("35", "3").replace("45", "4").replace("25", "2")
    pinyin = pinyin.join("").toLowerCase()
    glossary = cedict_glossary parsed[4]
    unless glossary.length then return null
    [word, pinyin, glossary]
  data = data.filter (a) -> a
  data = cedict_additions data
  data = cedict_merge_definitions data
  data.forEach (a) -> a[2] = a[2].join "; "
  data = sort_by_frequency frequency_index, 0, 1, data
  data = data.filter (a, index) -> index < 3000 || a[0].length < 3
  test = () ->
    example1 = data.findIndex((a) => a[0] is "猫")
    example2 = data.findIndex((a) => a[0] is "熊猫")
    throw "test failed" unless example1 < example2
  #test()
  write_csv_file "data/cedict.csv", data

dictionary_cedict_to_json = (data) ->
  JSON.stringify data.map (a) ->
    a[2] = a[2].split "/"
    a

update_dictionary = () ->
  words = read_csv_file "data/cedict.csv"
  words = dictionary_cedict_to_json words
  js = fs.readFileSync "js/dictionary.js", "utf8"
  js = js.replace "__word_data__", words
  html = fs.readFileSync "html/hanyu-dictionary-template.html", "utf8"
  html = html.replace "__script__", js.trim()
fs.writeFile "html/hanyu-dictionary.html", html, on_error clean_frequency_list = () -> frequency_array = array_from_newline_file "data/frequency.csv", "utf-8" frequency_array = frequency_array.filter (a) -> traditional_to_simplified remove_non_chinese_characters a frequency_array.forEach (a) -> console.log a update_hsk = () -> files = fs.readdirSync "data/hsk" data = files.map (a) -> read_csv_file("data/hsk/#{a}", "\t") data = data.flat(1).map (a) -> pinyin = pinyin_split2(a[2]).map(pinyin_utils.markToNumber).join("").toLowerCase() [a[1], pinyin] write_csv_file "data/hsk.csv", data dictionary_index_word_f = (lookup_index) -> dictionary = {} read_csv_file("data/cedict.csv").forEach (a) -> object_array_add dictionary, a[lookup_index], a (a) -> dictionary[a] dictionary_index_word_pinyin_f = () -> dictionary = {} word_index = 0 pinyin_index = 1 words = read_csv_file "data/cedict.csv" words.forEach (a) -> word = a[word_index] key = a[word_index] + a[pinyin_index] object_array_add dictionary, key, a object_array_add dictionary, word, a (word, pinyin) -> dictionary[word + pinyin] update_frequency_pinyin = () -> dictionary_lookup = dictionary_index_word_pinyin_f 0, 1 frequency = array_from_newline_file "data/frequency.csv", "utf-8" hsk = read_csv_file "data/hsk.csv" hsk_index = {} hsk.forEach (a) -> return if hsk_index[a[0]] pinyin = a[1] pinyin += "5" unless /[0-5]$/.test pinyin hsk_index[a[0]] = pinyin frequency_pinyin = frequency.map (a) -> [a, (hsk_index[a] || "")] rows = frequency_pinyin.map (a) -> translation = dictionary_lookup a[0], a[1] return [] unless translation [a[0], translation[0][1], translation[0][2]] rows = rows.filter (a) -> 3 is a.length write_csv_file "data/frequency-pinyin-translation.csv", rows rows = rows.map (a) -> [a[0], a[1]] write_csv_file "data/frequency-pinyin.csv", rows mark_to_number = (a) -> a.split(" ").map((a) -> pinyin_split2(a).map(pinyin_utils.markToNumber).join("")).join(" ") find_multiple_word_matches = (a, lookup_index, translation_index, split_syllables) -> # for each space separated element, find all longest most frequent words with the pronunciation. 
  dictionary_lookup = dictionary_index_word_f lookup_index
  results = []
  a.split(" ").forEach (a) ->
    syllables = split_syllables a
    max_word_length = 5
    per_length = (i, j) -> syllables.slice(i, j).join("")
    per_syllable = (i) ->
      end = Math.min(i + max_word_length, syllables.length) + 1
      per_length i, j for j in [(i + 1)...end]
    candidates = (per_syllable i for i in [0...syllables.length])
    i = 0
    while i < candidates.length
      matches = []
      j = 0
      reversed_candidates = candidates[i].toReversed()
      while j < reversed_candidates.length
        translations = dictionary_lookup reversed_candidates[j]
        if translations
          matches.push translations.map((a) -> a[translation_index]).join "/"
          break
        j += 1
      if matches.length
        results.push matches[0]
        i += reversed_candidates.length - j
      else
        results.push candidates[i][0]
        i += 1
  results.join " "

dsv_add_translations_with_pinyin = (word_index, pinyin_index) ->
  dictionary_lookup = dictionary_index_word_pinyin_f 0, 1
  rows = read_csv_file(0).map (a) ->
    translations = dictionary_lookup a[word_index], a[pinyin_index]
    return a unless translations
    a.concat [translations[0][2]]
  console.log csv_stringify.stringify(rows, {delimiter: " "}, on_error).trim()

dsv_add_translations = (word_index) ->
  dictionary_lookup = dictionary_index_word_f 0
  rows = read_csv_file(0).map (a) ->
    translations = dictionary_lookup a[word_index]
    return a unless translations
    a.concat [translations[0][1], translations[0][2]]
  write_csv_file 0, rows

dsv_mark_to_number = (pinyin_index) ->
  rows = read_csv_file(0).map (a) ->
    a[pinyin_index] = mark_to_number a[pinyin_index]
    a
  write_csv_file 0, rows

update_hsk_pinyin_translations = () ->
  dictionary_lookup = dictionary_index_word_pinyin_f 0, 1
  rows = read_csv_file("data/hsk.csv").map (a) ->
    translations = dictionary_lookup a[0], a[1]
    return a unless translations
    a.concat [translations[0][2]]
  write_csv_file "data/hsk-pinyin-translation.csv", rows

pinyin_to_hanzi = (a) ->
  a = a.replace(non_pinyin_regexp, " ").trim()
  find_multiple_word_matches a, 1, 0, pinyin_split2

hanzi_to_pinyin = (a) ->
  a = a.replace(non_hanzi_regexp, " ").trim()
  find_multiple_word_matches a, 0, 1, split_chars

dsv_add_example_words = () ->
  dictionary = dictionary_index_word_pinyin_f 0, 1
  words = read_csv_file "data/frequency-pinyin-translation.csv"
  rows = read_csv_file(0).map (a) ->
    char_words = words.filter((b) -> b[0].includes a[0])
    unless char_words.length
      char_words = dictionary(a[0], a[1]) || []
    a.push char_words.slice(0, 5).map((b) -> b.join(" ")).join("\n")
    a
  write_csv_file 0, rows

update_characters_by_pinyin = () ->
  by_pinyin = {}
  chars = get_all_characters_and_pinyin().filter((a) -> !a[1].endsWith("5"))
  chars.forEach (a) -> object_array_add by_pinyin, a[1], a[0]
  rows = Object.keys(by_pinyin).map (a) -> [a, by_pinyin[a].join("")]
  rows = rows.sort (a, b) -> a[0].localeCompare(b[0]) || b[1].length - a[1].length
  write_csv_file "data/characters-by-pinyin.csv", rows
  # only common characters
  common_limit = 2000
  by_pinyin = {}
  chars = get_all_characters_and_pinyin().filter((a) -> !a[1].endsWith("5"))
  chars = chars.slice(0, common_limit)
  chars.forEach (a) -> object_array_add by_pinyin, a[1], a[0]
  rows = Object.keys(by_pinyin).map (a) -> [a, by_pinyin[a].join("")]
  rows = rows.sort (a, b) -> a[0].localeCompare(b[0]) || b[1].length - a[1].length
  write_csv_file "data/characters-by-pinyin-common.csv", rows

http_get = (url) ->
  new Promise (resolve, reject) ->
    http.get url, (response) ->
      data = []
      response.on "data", (a) -> data.push a
      response.on "end", () -> resolve Buffer.concat(data).toString()
      response.on "error", (error) -> reject error

sort_by_array_with_index = (a, sorting, index) -> a.sort (a, b) -> sorting.indexOf(a[index]) - sorting.indexOf(b[index])

index_key_value = (a, key_key, value_key) ->
  b = {}
  a.forEach (a) -> b[a[key_key]] = a[value_key]
  b

get_compositions_index = () -> index_key_value read_csv_file("data/character-strokes-composition.csv"), 0, 2

get_full_compositions = () ->
  # also include compositions of components per entry
  compositions_index = get_compositions_index()
  decompose = (a) ->
    parts = compositions_index[a]
    if parts
      parts = [...parts]
      [a].concat(parts, parts.map(decompose))
    else
      [a]
  Object.keys(compositions_index).map (a) ->
    parts = decompose(a).flat(Infinity)
    [parts[0], delete_duplicates(parts.slice(1))]
    #[parts[0], parts.slice(1)]

get_full_compositions_index = () -> index_key_value get_full_compositions(), 0, 1
get_stroke_count_index = (a) -> index_key_value read_csv_file("data/character-strokes-composition.csv"), 0, 1

update_character_overlap = () ->
  # 大犬太 草早旱
  config = {
    min_overlap: 0.7
    min_component_stroke_count: 0
    max_stroke_count_difference: 0.6
  }
  stroke_count_index = get_stroke_count_index()
  compositions = get_full_compositions()
  compositions = compositions.map (a) ->
    a1 = a[1].filter (a) -> (stroke_count_index[a] || 1) > config.min_component_stroke_count && a.match(hanzi_regexp)
    [a[0], a1]
  compositions = compositions.filter (a) -> a[1].length
  similarities = compositions.map (a) ->
    #return [] unless a[0] == "大"
    #return [] unless a[0] == "草"
    similarities = compositions.map (b) ->
      unless a[0] == b[0]
        b_compositions = b[1].filter (b) -> a[0] != b
        intersection = array_intersection a[1], b_compositions
        overlap = intersection.length / Math.max(a[1].length, b_compositions.length)
        a_strokes = stroke_count_index[a[0]]
        b_strokes = stroke_count_index[b[0]]
        stroke_difference = Math.abs(b_strokes - a_strokes)
        stroke_difference_ratio = stroke_difference / a_strokes
        if overlap > config.min_overlap && (stroke_difference < 4 || stroke_difference_ratio < config.max_stroke_count_difference)
          #console.log a[0], b[0], overlap, stroke_difference_ratio, a[1].join(""), b_compositions.join("")
          [a[0], b[0], overlap, stroke_difference]
    similarities = similarities.filter (a) -> a
    similarities.sort (a, b) -> b[2] - a[2] || a[3] - b[3]
  similarities = similarities.filter (a) -> a.length
  similarities = similarities.map (a) ->
    b = a.map (a) -> a[1]
    [a[0][0], b.join("")]
  similarities = similarities.sort (a, b) -> b[1].length - a[1].length
  write_csv_file "data/character-overlap.csv", similarities
  character_frequency_index = get_character_frequency_index()
  similarities_common = similarities.map (a) -> [a[0], split_chars(a[1]).filter((a) -> (character_frequency_index[a] || 9000) < 4000).join("")]
  similarities_common = similarities_common.filter (a) -> a[1].length
  similarities_common = similarities_common.sort (a, b) -> b[1].length - a[1].length
  write_csv_file "data/character-overlap-common.csv", similarities_common

get_character_reading_count_index = () ->
  result = {}
  read_csv_file("data/character-reading-count.csv").forEach (a) -> result[a[0] + a[1]] = parseInt a[2]
  result

get_character_syllables_tones_count_index = () ->
  result = {}
  read_csv_file("data/syllables-tones-character-count.csv").forEach (a) -> result[a[0]] = parseInt a[1]
  result

get_character_example_words_f = () ->
  dictionary = dictionary_index_word_pinyin_f 0, 1
  words = read_csv_file "data/frequency-pinyin-translation.csv"
  (char, pinyin) ->
    char_word = words.find((b) -> b[0] is char)
    unless char_word
      char_word = dictionary(char, pinyin)
      char_word = char_word[0] if char_word
    char_words = if char_word then [char_word] else []
    char_words.concat words.filter (b) -> b[0].includes(char) && b[0] != char

sort_standard_character_readings = () ->
  reading_count_index = get_character_reading_count_index()
  file_path = "data/table-of-general-standard-chinese-characters.csv"
  rows = read_csv_file(file_path).map (a) ->
    char = a[0]
    pinyin = a[1].split(", ").map (a) -> if a.match(/[0-5]$/) then a else a + "5"
    pinyin = pinyin.sort (a, b) -> (reading_count_index[char + b] || 0) - (reading_count_index[char + a] || 0)
    a[1] = pinyin.join ", "
    a
  write_csv_file file_path, rows

get_character_by_reading_index = () ->
  result = {}
  read_csv_file("data/table-of-general-standard-chinese-characters.csv").forEach (a) ->
    pinyin = a[1].split(", ")[0]
    object_array_add result, pinyin, a[0]
  result

add_sort_field = (rows) ->
  a.push i for a, i in rows
  rows

update_pinyin_learning = () ->
  # pinyin, word_choices -> word, translation
  options =
    words_per_char: 3
    word_choices: 5
  character_frequency_index = get_character_frequency_index()
  get_character_example_words = get_character_example_words_f()
  standard_chars = read_csv_file("data/table-of-general-standard-chinese-characters.csv")
  chars = standard_chars.map (a) -> [a[0], a[1].split(", ")[0]]
  chars = sort_by_character_frequency character_frequency_index, 0, chars
  rows = for a in chars
    a = get_character_example_words(a[0], a[1])
    if 1 < a.length then a = a.slice 1, options.words_per_char + 1
    [b[1], b[0], b[2]] for b in a
  rows = rows.flat 1
  rows = array_deduplicate_key rows, (a) -> a[1]
  add_word_choices = (rows) ->
    rows.map (a) ->
      tries = 30
      alternatives = [a[1]]
      while tries && alternatives.length < options.word_choices
        alternative = random_element rows
        if a[1].length == alternative[1].length && a[0] != alternative[0] && !alternatives.includes(alternative[1])
          alternatives.push alternative[1]
        tries -= 1
      a.push array_shuffle(alternatives).join(" ")
      a
  rows = add_sort_field add_word_choices rows
  write_csv_file "data/pinyin-learning.csv", rows

update_character_learning = () ->
  character_frequency_index = get_character_frequency_index()
  reading_count_index = get_character_reading_count_index()
  character_by_reading_index = get_character_by_reading_index()
  get_character_example_words = get_character_example_words_f()
  compositions_index = get_compositions_index()
  rows = read_csv_file("data/table-of-general-standard-chinese-characters.csv").map (a) -> [a[0], a[1].split(", ")[0]]
  rows = array_deduplicate_key rows, (a) -> a[0]
  rows = sort_by_character_frequency character_frequency_index, 0, rows
  syllables = delete_duplicates rows.map((a) -> a[1].split(", ")).flat()
  add_guess_readings = (rows) ->
    # example: "kan3 (5) chui1 (2) lun3 (2) hong1 (6) tang4 (2) du1 (5) xie3 (2) fan2 (16)"
    syllable_count_index = get_character_syllables_tones_count_index()
    max_guess_readings = 5
    rows.map (a) ->
      # add for each guess reading the number of other characters with this reading
      alternatives = n_times max_guess_readings - 1, (n) -> random_element syllables
      alternatives = delete_duplicates array_shuffle [a[1]].concat alternatives
      alternatives = alternatives.map (b) -> b + " (" + (syllable_count_index[b] || 1) + ")"
      a.push alternatives.join " "
      a
  #rows = add_guess_readings rows
  add_same_reading_characters = (rows) ->
    max_same_reading_characters = 10
    rows.map (a) ->
      b = array_shuffle (character_by_reading_index[a[1]] || []).filter((b) -> a[0] != b)
      a.push b.slice(0, max_same_reading_characters).join ""
      a
  rows = add_same_reading_characters rows
  add_syllable_counts = (rows) ->
    rows.map (a) ->
      # must be run after add_guess_readings.
      # add for each possible reading the number of words with this character and reading
      a[1] = a[1].split(", ").map((b) -> b + " (" + (reading_count_index[a[0] + b] || 1) + ")").join(", ")
      a
  # add compositions
  rows.map (a) ->
    b = compositions_index[a[0]]
    a.push b if b
    a
  rows = add_sort_field rows
  # write
  write_csv_file "data/character-learning-without-translations.csv", rows
  add_example_words = (rows) ->
    rows.map (a) ->
      words = get_character_example_words a[0], a[1]
      a.push words.slice(1, 5).map((b) -> b[0]).join " "
      a.push words.slice(0, 5).map((b) -> b.join(" ")).join "\n"
      a
  rows = add_example_words rows
  write_csv_file "data/character-learning.csv", rows

update_syllables_character_count = () ->
  # number of characters with the same reading
  chars = read_csv_file("data/characters-by-pinyin.csv").map (a) -> [a[0], a[1].length]
  chars_common = read_csv_file("data/characters-by-pinyin-common.csv").map (a) -> [a[0], a[1].length]
  chars_without_tones = chars.map (a) -> [a[0].replace(/[0-5]/g, ""), a[1]]
  get_data = (chars) ->
    counts = {}
    chars.forEach (a) -> if counts[a[0]] then counts[a[0]] += a[1] else counts[a[0]] = a[1]
    chars = chars.map (a) -> a[0]
    chars = delete_duplicates_stable chars
    chars.map((a) -> [a, counts[a]]).sort (a, b) -> b[1] - a[1]
  write_csv_file "data/syllables-tones-character-count.csv", get_data(chars)
  write_csv_file "data/syllables-character-count.csv", get_data(chars_without_tones)
  write_csv_file "data/syllables-tones-character-count-common.csv", get_data(chars_common)

grade_text_files = (paths) ->
  paths.forEach (a) -> console.log grade_text(fs.readFileSync(a, "utf-8")) + " " + path.basename(a)

grade_text = (a) ->
  chars = delete_duplicates a.match hanzi_regexp
  frequency_index = get_character_frequency_index()
  all_chars_count = Object.keys(frequency_index).length
  frequencies = chars.map((a) -> frequency_index[a] || all_chars_count).sort((a, b) -> a - b)
  count_score = chars.length / all_chars_count
  rarity_score = median(frequencies.splice(-10)) / all_chars_count
  Math.max 1, Math.round(10 * (count_score + rarity_score))

get_wiktionary_data = (char) ->
  body = await http_get "https://en.wiktionary.org/wiki/#{char}"
  html = html_parser.parse body
  b = html.querySelectorAll "p"
  b = b.map (a) ->
    strokes = a.textContent.match /(\d+) stroke/
    strokes = strokes && parseInt(strokes[1], 10)
    composition = a.querySelector "a[title=\"w:Chinese character description languages\"]"
    if composition
      composition = composition.parentNode.parentNode.textContent
      composition = composition.match(/composition (.*)\)/)[1]
      composition = (composition.split(" or ")[0].match(hanzi_and_idc_regexp) || []).join("")
    [char, strokes, composition]
  b = b.filter (a) -> a[1] || a[2]
  b.flat()

update_extra_stroke_counts = () ->
  data = read_csv_file "data/extra-components.csv"
  data = data.sort (a, b) -> b.length - a.length
  data = delete_duplicates_stable_with_key data, 0
  data = data.filter (a) -> a.length > 1
  data = data.map (a) -> [a[0], parseInt(a[1], 10)]
  data = data.sort (a, b) -> a[1] - b[1]
  write_csv_file "data/extra-stroke-counts.csv", data

update_compositions = (start_index, end_index) ->
  chars = read_csv_file "data/character-strokes-composition.csv"
  chars = chars.filter (a) -> "1" != a[1]
  chars = chars.slice start_index, end_index
  batch_size = 10
  batches_count = Math.ceil chars.length / batch_size
  batches = []
  i = 0
  while i < batches_count
    batches.push chars.slice i * batch_size, (i + 1) * batch_size
    i += 1
  batches.forEach (a) ->
    requests = Promise.all a.map (b) -> get_wiktionary_data b[0]
    requests.then (b) ->
      b.forEach (b, i) ->
        c = a[i]
        if (b[1] && b[1] != parseInt(c[1], 10)) || (b[2] && b[2].length >= c[2].length && b[2] != c[2])
          c[1] = b[1]
          c[2] = b[2]
          console.log c.join " "

add_new_data = () ->
  chars = read_csv_file "data/character-strokes-composition.csv"
  new_data = read_csv_file("new-data").filter (a) -> a[0].length
  all = chars.concat new_data
  all_index = {}
  all.forEach (a) -> all_index[a[0]] = a
  all = Object.values(all_index).sort (a, b) -> a[1] - b[1] || a[0].localeCompare(b[0])
  write_csv_file "data/character-strokes-composition-new.csv", all

sort_data = () ->
  chars = read_csv_file "data/character-strokes-composition.csv"
  chars = chars.sort (a, b) -> a[1] - b[1] || a[0].localeCompare(b[0])
  write_csv_file "data/character-strokes-composition-new.csv", chars

find_component_repetitions = () ->
  chars = read_csv_file "data/character-strokes-composition.csv"
  chars = chars.forEach (a) ->
    if a[2]
      b = a[2].replace non_hanzi_regexp, ""
      if 1 == delete_duplicates(split_chars(b)).length
        console.log a[0], b

run = () ->
  #console.log "コ刂".match hanzi_regexp
  #find_component_repetitions()
  #console.log non_hanzi_regexp
  #sort_data()
  #add_new_data()
  #update_compositions 7000, 9000
  #write_csv_file()
  #find_missing_compositions()
  #get_full_compositions()
  #data = delete_duplicates(data).sort((a, b) -> a.localeCompare(b))
  #fs.writeFileSync("data/extra-components-new.csv", data.join("\n"))
  #filter_common_characters()
  #sort_standard_character_readings()
  update_syllables_character_count()
  #update_character_reading_count()
  #update_character_learning()
  #update_syllables_with_tones_by_reading()
  #console.log "/" + hanzi_unicode_ranges_regexp + "/gu"
  #display_all_characters()
  #update_syllables_by_reading()
  #update_compositions()

module.exports = {
  update_character_overlap
  cedict_filter_only
  clean_frequency_list
  dsv_add_translations
  dsv_add_example_words
  dsv_mark_to_number
  update_cedict_csv
  update_dictionary
  update_frequency_pinyin
  update_hsk
  update_hsk_pinyin_translations
  traditional_to_simplified
  pinyin_to_hanzi
  hanzi_to_pinyin
  mark_to_number
  update_characters_by_pinyin
  update_character_learning
  update_pinyin_learning
  grade_text
  grade_text_files
  run
}
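
# usage sketch (hypothetical: the require path and input values below are assumptions;
# actual results depend on the csv files under data/):
#   hanyu = require "./main"
#   console.log hanyu.hanzi_to_pinyin "你好"
#   console.log hanyu.pinyin_to_hanzi "ni3 hao3"
#   console.log hanyu.grade_text fs.readFileSync("some-text.txt", "utf-8")
#   hanyu.run()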