csv_parse = require "csv-parse/sync"
csv_stringify = require "csv-stringify/sync"
coffee = require "coffeescript"
fs = require "fs"
hanzi_tools = require "hanzi-tools"
html_parser = require "node-html-parser"
http = require "https"
node_path = require "path"
pinyin_split = require "pinyin-split"
pinyin_utils = require "pinyin-utils"
{DOMParser, XMLSerializer} = require "xmldom"
#scraper = require "table-scraper"
read_text_file = (a) -> fs.readFileSync a, "utf8"
read_csv_file = (path, delimiter) -> csv_parse.parse read_text_file(path), {delimiter: delimiter || " ", relax_column_count: true}
replace_placeholders = (text, mapping) -> text.replace /__(.*?)__/g, (_, k) -> mapping[k] or ""
array_from_newline_file = (path) -> read_text_file(path).toString().trim().split("\n")
on_error = (a) -> if a then console.error a
delete_duplicates = (a) -> [...new Set(a)]
split_chars = (a) -> [...a]
random_integer = (min, max) -> Math.floor(Math.random() * (max - min + 1)) + min
random_element = (a) -> a[random_integer 0, a.length - 1]
n_times = (n, f) -> [...Array(n).keys()].map f
remove_non_chinese_characters = (a) -> a.replace /[^\p{Script=Han}]/ug, ""
traditional_to_simplified = (a) -> hanzi_tools.simplify a
pinyin_split2 = (a) -> a.replace(/[0-5]/g, (a) -> a + " ").trim().split " "
median = (a) -> a.slice().sort((a, b) -> a - b)[Math.floor(a.length / 2)]
sum = (a) -> a.reduce ((a, b) -> a + b), 0
mean = (a) -> sum(a) / a.length
object_array_add = (object, key, value) -> if object[key] then object[key].push value else object[key] = [value]
object_array_add_unique = (object, key, value) ->
if object[key] then object[key].push value unless object[key].includes value
else object[key] = [value]
array_intersection = (a, b) -> a.filter (a) -> b.includes(a)
write_csv_file = (path, data) ->
  csv = csv_stringify.stringify(data, {delimiter: " "}).trim()
fs.writeFile path, csv, on_error
delete_duplicates_stable = (a) ->
result = []
existing = {}
a.forEach (a) ->
unless existing[a]
existing[a] = true
result.push a
result
delete_duplicates_stable_with_key = (a, key) ->
result = []
existing = {}
a.forEach (a) ->
unless existing[a[key]]
existing[a[key]] = true
result.push a
result
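# Illustrative use: both helpers keep the first occurrence and preserve input order, e.g.
#   delete_duplicates_stable ["b", "a", "b"]                         # -> ["b", "a"]
#   delete_duplicates_stable_with_key [{k: 1}, {k: 1}, {k: 2}], "k"  # -> [{k: 1}, {k: 2}]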
lcg = (seed) ->
m = 2 ** 31
a = 1103515245
c = 12345
state = seed
->
state = (a * state + c) % m
state / m
array_shuffle = (a) ->
rand = lcg(23465700980)
n = a.length
while n > 0
i = Math.floor rand() * n
n -= 1
[a[n], a[i]] = [a[i], a[n]]
a
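# array_shuffle is a Fisher-Yates shuffle driven by the fixed-seed lcg above, so it is
# deterministic: the same input yields the same permutation on every run, and the input
# array is shuffled in place. Illustrative use:
#   array_shuffle [1, 2, 3, 4]  # same order every time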
array_deduplicate_key = (a, get_key) ->
existing = {}
a.filter (a) ->
key = get_key a
if existing[key] then false
else
existing[key] = true
true
# https://en.wiktionary.org/wiki/Appendix:Unicode
hanzi_unicode_ranges = [
["30A0", "30FF"] # katakana used for some components
["2E80", "2EFF"]
["31C0", "31EF"]
["4E00", "9FFF"]
["3400", "4DBF"]
["20000", "2A6DF"]
["2A700", "2B73F"]
["2B740", "2B81F"]
["2B820", "2CEAF"]
["2CEB0", "2EBEF"]
["30000", "3134F"]
["31350", "323AF"]
["2EBF0", "2EE5F"]
]
unicode_ranges_pattern = (a, is_reject) -> "[" + (if is_reject then "^" else "") + a.map((a) -> a.map((b) -> "\\u{#{b}}").join("-")).join("") + "]"
unicode_ranges_regexp = (a, is_reject) -> new RegExp unicode_ranges_pattern(a, is_reject), "gu"
hanzi_regexp = unicode_ranges_regexp hanzi_unicode_ranges
non_hanzi_regexp = unicode_ranges_regexp hanzi_unicode_ranges, true
hanzi_and_idc_regexp = unicode_ranges_regexp hanzi_unicode_ranges.concat([["2FF0", "2FFF"]])
non_pinyin_regexp = /[^a-z0-5]/g
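# Illustrative use of the range regexps (中 is U+4E2D and 文 is U+6587, both inside 4E00-9FFF):
#   "abc 中文 123".match(hanzi_regexp)            # -> ["中", "文"]
#   "abc 中文 123".replace(non_hanzi_regexp, "")  # -> "中文"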
get_word_frequency_index = () ->
  # -> {word: integer}
frequency = array_from_newline_file "data/words-by-frequency.txt"
frequency_index = {}
frequency.forEach (a, i) ->
a = a.replace " ", ""
    frequency_index[a] = i unless frequency_index[a]?
frequency_index
get_word_frequency_index_with_pinyin = () ->
# -> {"#{word}#{pinyin}": integer}
frequency = array_from_newline_file "data/words-by-frequency-with-pinyin.csv"
frequency_index = {}
frequency.forEach (a, i) ->
a = a.replace " ", ""
    frequency_index[a] = i unless frequency_index[a]?
frequency_index
get_all_standard_characters = () -> read_csv_file("data/table-of-general-standard-chinese-characters.csv").map (a) -> a[0]
get_all_standard_characters_with_pinyin = () ->
a = read_csv_file("data/table-of-general-standard-chinese-characters.csv").map (a) -> [a[0], a[1].split(",")[0]]
b = read_csv_file("data/additional-characters.csv").filter((a) -> !character_exclusions.includes(a[0])).map (a) -> [a[0], a[1].split(",")[0]]
a.concat b
get_all_characters = () -> read_csv_file("data/characters-strokes-decomposition.csv").map (a) -> a[0]
display_all_characters = () -> console.log get_all_characters().join("")
get_all_characters_with_pinyin = () ->
dict = dictionary_index_word_f 0
chars = {}
for a in read_csv_file "data/table-of-general-standard-chinese-characters.csv"
pinyin = a[1].split(", ")[0]
chars[a[0]] = pinyin
for a in read_csv_file "data/additional-characters.csv"
chars[a[0]] = a[1] unless chars[a[0]]
for a in read_csv_file "data/characters-strokes-decomposition.csv"
pinyin = dict(a[0])?[0][1]
chars[a[0]] = pinyin if pinyin && !chars[a[0]]
continue if a.length < 3
for b in split_chars(a[2])
continue unless b.match hanzi_regexp
pinyin = dict(b)?[0][1]
chars[b] = pinyin if pinyin && !chars[b]
data = ([a, b] for a, b of chars)
char_index = split_chars read_text_file("data/characters-by-frequency.txt").trim()
data.sort (a, b) ->
ia = char_index.indexOf a[0]
ib = char_index.indexOf b[0]
(if ia is -1 then Infinity else ia) - (if ib is -1 then Infinity else ib)
data
get_character_by_reading_index = () ->
chars = get_all_characters_with_pinyin()
result = {}
chars.forEach (a) -> object_array_add result, a[1], a[0]
result
get_frequency_characters_and_pinyin = () ->
# with duplicates. use case: count character reading frequency
result = []
a = read_csv_file "data/words-by-frequency-with-pinyin.csv"
a.forEach (a) ->
chars = split_chars a[0]
pinyin = pinyin_split2 a[1]
chars.forEach (a, i) -> result.push [a, pinyin[i]]
result
get_all_characters_sorted_by_frequency = () ->
delete_duplicates_stable get_all_characters_with_pinyin().map (a) -> split_chars(a[0])[0]
get_character_frequency_index = () ->
# -> {character: integer}
chars = get_all_characters_sorted_by_frequency()
frequency_index = {}
chars.forEach (a, i) -> frequency_index[a] = i
frequency_index
get_character_pinyin_frequency_index = () ->
# -> {character + pinyin: integer}
chars = get_frequency_characters_and_pinyin()
result = {}
index = 0
chars.forEach (a) ->
key = a[0] + (a[1] || "")
    unless result[key]?
result[key] = index
index += 1
result
update_character_reading_count = () ->
  # counts how often each character/pinyin combination occurs in the frequency
  # word list, to estimate how common the different readings of a character are
  index = {}
  rows = []
  chars = new Set get_all_characters()
  chars_and_pinyin = get_frequency_characters_and_pinyin()
  chars_and_pinyin.forEach (a) ->
    return unless chars.has a[0]
    key = a[0] + " " + (a[1] || "")
    if index[key]? then index[key] += 1
    else index[key] = 1
  Object.keys(index).forEach (a) ->
    [char, pinyin] = a.split " "
    rows.push [char, pinyin, index[a]]
  rows = rows.sort (a, b) -> b[2] - a[2]
  write_csv_file "data/characters-pinyin-count.csv", rows
sort_by_index_and_character_f = (index, character_key) ->
# {character: integer, ...}, any -> function(a, b)
f = sort_by_character_f index
(a, b) -> f a[character_key], b[character_key]
sort_by_character_f = (index) ->
(a, b) ->
ia = index[a]
ib = index[b]
if ia is undefined and ib is undefined
      (a.length - b.length) || a.localeCompare(b)
else if ia is undefined then 1
else if ib is undefined then -1
else ia - ib
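# Illustrative use: characters present in the index sort by their index value,
# unknown characters sort after them:
#   ["乙", "的"].sort sort_by_character_f {"的": 0}  # -> ["的", "乙"]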
sort_by_character_frequency = (frequency_index, character_key, data) ->
data.sort sort_by_index_and_character_f frequency_index, character_key
sort_by_stroke_count = (stroke_count_index, character_key, data) ->
data.sort sort_by_index_and_character_f stroke_count_index, character_key
sort_by_word_frequency_with_pinyin = (frequency_index, word_key, pinyin_key, data) ->
data.sort (a, b) ->
fa = frequency_index[a[word_key] + a[pinyin_key]]
fb = frequency_index[b[word_key] + b[pinyin_key]]
if fa is undefined and fb is undefined
a[word_key].length - b[word_key].length
else if fa is undefined
1
else if fb is undefined
-1
else
fa - fb
sort_by_word_frequency = (frequency_index, word_key, data) ->
data.sort (a, b) ->
fa = frequency_index[a[word_key]]
fb = frequency_index[b[word_key]]
if fa is undefined and fb is undefined
a[word_key].length - b[word_key].length
else if fa is undefined
1
else if fb is undefined
-1
else
fa - fb
dictionary_cedict_to_json = (data) ->
JSON.stringify data.map (a) ->
a[2] = a[2].split "/"
    a.push a[1].replace /[0-5]/g, "" # pinyin without tone digits
a
update_dictionary = () ->
word_data = read_csv_file "data/cedict.csv"
word_data = dictionary_cedict_to_json word_data
character_data = read_text_file "data/characters-svg.json"
script = read_text_file "src/dictionary.coffee"
script = coffee.compile(script, bare: true).trim()
script = replace_placeholders script, {word_data, character_data}
font = read_text_file "src/NotoSansSC-Light.ttf.base64"
html = read_text_file "src/hanyu-dictionary-template.html"
html = replace_placeholders html, {font, script}
fs.writeFileSync "compiled/hanyu-dictionary.html", html
clean_frequency_list = () ->
  # print only the entries that contain at least one Chinese character
  frequency_array = array_from_newline_file "data/words-by-frequency.txt"
frequency_array = frequency_array.filter (a) ->
traditional_to_simplified remove_non_chinese_characters a
frequency_array.forEach (a) -> console.log a
dictionary_index_word_f = (lookup_index) ->
dictionary = {}
read_csv_file("data/cedict.csv").forEach (a) -> object_array_add dictionary, a[lookup_index], a
(a) -> dictionary[a]
dictionary_index_word_pinyin_f = () ->
dictionary = {}
word_index = 0
pinyin_index = 1
words = read_csv_file "data/cedict.csv"
words.forEach (a) ->
word = a[word_index]
key = a[word_index] + a[pinyin_index]
object_array_add dictionary, key, a
object_array_add dictionary, word, a
(word, pinyin) -> dictionary[word + pinyin]
mark_to_number = (a) ->
a.split(" ").map((a) -> pinyin_split2(a).map(pinyin_utils.markToNumber).join("")).join(" ")
find_multiple_word_matches = (a, lookup_index, translation_index, split_syllables) ->
# for each space separated element, find all longest most frequent words with the pronunciation.
dictionary_lookup = dictionary_index_word_f lookup_index
results = []
a.split(" ").forEach (a) ->
syllables = split_syllables a
max_word_length = 5
per_length = (i, j) -> syllables.slice(i, j).join("")
per_syllable = (i) ->
end = Math.min(i + max_word_length, syllables.length) + 1
per_length i, j for j in [(i + 1)...end]
candidates = (per_syllable i for i in [0...syllables.length])
i = 0
while i < candidates.length
matches = []
j = 0
reversed_candidates = candidates[i].toReversed()
while j < reversed_candidates.length
translations = dictionary_lookup reversed_candidates[j]
if translations
matches.push translations.map((a) -> a[translation_index]).join "/"
break
j += 1
if matches.length
results.push matches[0]
i += reversed_candidates.length - j
else
results.push candidates[i][0]
i += 1
results.join " "
pinyin_to_hanzi = (a) ->
a = a.replace(non_pinyin_regexp, " ").trim()
find_multiple_word_matches a, 1, 0, pinyin_split2
hanzi_to_pinyin = (a) ->
a = a.replace(non_hanzi_regexp, " ").trim()
find_multiple_word_matches a, 0, 1, split_chars
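# Illustrative use of the two converters; the results depend on the contents of
# data/cedict.csv, so these are only sketches of the expected shape:
#   hanzi_to_pinyin "你好"     # greedy longest-word lookup, roughly "ni3 hao3"
#   pinyin_to_hanzi "ni3hao3"  # dictionary words with that reading, joined with "/"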
get_character_pinyin_index = ->
index = {}
chars = get_all_characters_with_pinyin().filter((a) -> !a[1].endsWith("5"))
chars.forEach (a) -> index[a[0]] = a[1].split(",")[0]
index
get_character_tone_index = ->
index = {}
chars = get_all_characters_with_pinyin().filter((a) -> !a[1].endsWith("5"))
chars.forEach (a) -> index[a[0]] = parseInt a[1][a[1].length - 1]
index
get_characters_by_pinyin_rows = ->
by_pinyin = {}
chars = get_all_characters_with_pinyin().filter((a) -> !a[1].endsWith("5"))
chars.forEach (a) -> object_array_add by_pinyin, a[1], a[0]
rows = Object.keys(by_pinyin).map (a) -> [a, by_pinyin[a]]
rows.sort (a, b) -> a[0].localeCompare(b[0]) || b[1].length - a[1].length
all_syllables = """
a ai an ang ao ba bai ban bang bao bei ben beng bi bian biang biao bie bin bing bo bu
ca cai can cang cao ce cei cen ceng cha chai chan chang chao che chen cheng chi chong
chou chu chua chuai chuan chuang chui chun chuo ci cong cou cu cuan cui cun cuo da dai
dan dang dao de dei den deng di dian diao die ding diu dong dou du duan dui dun duo e ei
en eng er fa fan fang fei fen feng fo fou fu ga gai gan gang gao ge gei gen geng gong gou
gu gua guai guan guang gui gun guo ha hai han hang hao he hei hen heng hong hou hu hua
huai huan huang hui hun huo ji jia jian jiang jiao jie jin jing jiong jiu ju juan jue jun
ka kai kan kang kao ke kei ken keng kong kou ku kua kuai kuan kuang kui kun kuo la lai
lan lang lao le lei leng li lia lian liang liao lie lin ling liu lo long lou lu luan lun
luo lü lüe ma mai man mang mao me mei men meng mi mian miao mie min ming miu mo mou mu
na nai nan nang nao ne nei nen neng ni nian niang niao nie nin ning niu nong nou nu nuan
nuo nü nüe o ou pa pai pan pang pao pei pen peng pi pian piao pie pin ping po pou pu qi
qia qian qiang qiao qie qin qing qiong qiu qu quan que qun ran rang rao re ren reng ri
rong rou ru rua ruan rui run ruo sa sai san sang sao se sen seng sha shai shan shang shao
she shei shen sheng shi shou shu shua shuai shuan shuang shui shun shuo si song sou su
suan sui sun suo ta tai tan tang tao te teng ti tian tiao tie ting tong tou tu tuan tui
tun tuo wa wai wan wang wei wen weng wo wu xi xia xian xiang xiao xie xin xing xiong xiu
xu xuan xue xun ya yan yang yao ye yi yin ying yong you yu yuan yue yun za zai zan zang
zao ze zei zen zeng zha zhai zhan zhang zhao zhe zhei zhen zheng zhi zhong zhou zhu zhua
zhuai zhuan zhuang zhui zhun zhuo zi zong zou zu zuan zui zun zuo
""".split " "
circle_arrows = ["→","↗","↑","↖","←","↙","↓","↘"]
get_syllable_circle_arrow = (s) ->
s = s.replace(/[0-5]$/, "")
i = all_syllables.indexOf s
circle_arrows[(Math.round(8 * i / all_syllables.length)) % 8]
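# Each syllable is mapped to one of eight arrows by its position in all_syllables,
# wrapping around the circle; the tone digit is ignored. Illustrative use:
#   get_syllable_circle_arrow "a3"  # -> "→" ("a" is the first syllable in the list)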
class_for_tone = (tone) -> "tone#{tone}"
build_prelearn = ->
prelearn = read_csv_file("/home/nonroot/chinese/1/lists/prelearn.csv").map (a) -> [a[0], a[1]]
groups = {}
for a in prelearn
object_array_add groups, a[1], a[0]
result = []
for k, v of groups
arrow = get_syllable_circle_arrow k
result.push [k + arrow, v.join("")]
result
build_pinyin_sets = ->
rows = get_characters_by_pinyin_rows()
flat = ([a[0], a[1].join("")] for a in rows)
by_count = flat.slice().sort (a, b) -> a[1].length - b[1].length
[flat, by_count]
build_contained = (tone_index, pinyin_index) ->
rows = get_characters_contained_rows()
([a[0], ([c, tone_index[c]] for c in a[1])] for a in rows)
render_row = ([label, data]) ->
if typeof data is "string"
"#{label}#{data}"
else
"#{label}#{data}"
#chars = data.map ([c, t]) -> "#{c}"
#"#{label}#{chars.join("")}"
update_character_tables_html = (tables) ->
nav_links = []
i = 0
make_table = (rows, name) ->
nav_links.push "#{name}"
i += 1
"
" + (rows.map render_row).join("\n") + "
"
content = (make_table v, k for k, v of tables).join "\n"
[content, nav_links.join("\n")]
get_characters_by_pinyin_rows_flat = ->
result = []
for a in get_characters_by_pinyin_rows()
for b in a[1]
result.push [a[0], b]
result
update_character_tables = ->
tone_index = get_character_tone_index()
pinyin_index = get_character_pinyin_index()
[pinyin, pinyin_by_count] = build_pinyin_sets()
prelearn = build_prelearn()
contained = build_contained tone_index, pinyin_index
tables =
pinyin: pinyin
contained: contained
pinyin_by_count: pinyin_by_count
prelearn: prelearn
[content, nav_links] = update_character_tables_html tables
font = read_text_file "src/NotoSansSC-Light.ttf.base64"
html = read_text_file "src/character-tables-template.html"
html = replace_placeholders html, {font, content, nav_links}
#fs.writeFileSync "compiled/character-tables.html", html
for key, value of tables
tables[key] = (b.reverse() for b in value)
prelearn2 = []
for a in prelearn
for b in split_chars a[0]
prelearn2.push [b, a[1]]
write_csv_file "tmp/prelearn.csv", prelearn2
by_pinyin = get_characters_by_pinyin_rows_flat()
by_syllable = []
for a in by_pinyin
syllable = a[0].replace /[0-5]$/, ""
by_syllable.push [syllable, a[1], a[0]]
write_csv_file "data/gridlearner/characters-by-syllable.csv", by_syllable
update_characters_by_pinyin_vertical = (rows) ->
vertical_rows = format_lines_vertically rows
fs.writeFileSync "data/characters-by-pinyin-by-count-vertical.csv", vertical_rows.join "\n"
update_characters_by_pinyin = () ->
by_pinyin = {}
chars = get_all_characters_with_pinyin().filter((a) -> !a[1].endsWith("5"))
chars.forEach (a) -> object_array_add by_pinyin, a[1], a[0]
rows = Object.keys(by_pinyin).map (a) -> [a, by_pinyin[a].join("")]
rows = rows.sort (a, b) -> a[0].localeCompare(b[0]) || b[1].length - a[1].length
write_csv_file "data/characters-by-pinyin.csv", rows
rows = rows.sort (a, b) -> b[1].length - a[1].length || a[0].localeCompare(b[0])
write_csv_file "data/characters-by-pinyin-by-count.csv", rows
#rows = rows.filter (a) -> a[1].length < 4
#rows = rows.sort (b, a) -> b[1].length - a[1].length || a[0].localeCompare(b[0])
#write_csv_file "data/characters-by-pinyin-rare.csv", rows
rare_rows = []
for p in Object.keys(by_pinyin)
if by_pinyin[p].length < 3
for c in by_pinyin[p]
rare_rows.push [c, p]
rare_rows = rare_rows.sort (a, b) -> a[1].localeCompare(b[1]) || a[0].localeCompare(b[0])
write_csv_file "data/characters-pinyin-rare.csv", rare_rows
sort_by_array_with_index = (a, sorting, index) ->
a.sort (a, b) -> sorting.indexOf(a[index]) - sorting.indexOf(b[index])
index_key_value = (a, key_key, value_key) ->
b = {}
a.forEach (a) -> b[a[key_key]] = a[value_key]
b
get_compositions_index = ->
decompositions = read_csv_file "data/characters-strokes-decomposition.csv"
  decompositions = ([a, (if c then split_chars(c) else [])] for [a, b, c] in decompositions)
compositions = {}
for a in decompositions
[char, a] = a
for component in a
c = compositions[component]
if c
unless c.includes char
c.push char
compositions[component] = c
else compositions[component] = [char]
frequency_sorter = sort_by_character_f get_character_frequency_index()
for a, b of compositions
compositions[a] = b.sort frequency_sorter
compositions
get_full_compositions_index = ->
full_decompositions = get_full_decompositions()
compositions = {}
for [char, components] in full_decompositions
for component in components
c = compositions[component]
if c
unless c.includes char
c.push char
else
compositions[component] = [char]
frequency_sorter = sort_by_character_f get_character_frequency_index()
for component, chars of compositions
compositions[component] = chars.sort frequency_sorter
compositions
get_decompositions_index = () -> index_key_value read_csv_file("data/characters-strokes-decomposition.csv"), 0, 2
get_full_decompositions = () ->
# also include decompositions of components per entry
decompositions_index = get_decompositions_index()
decompose = (a) ->
parts = decompositions_index[a]
if parts
parts = [...parts]
[a].concat(parts, parts.map(decompose))
else [a]
Object.keys(decompositions_index).map (a) ->
parts = decompose(a).flat(Infinity)
[parts[0], delete_duplicates(parts.slice(1))]
#[parts[0], parts.slice(1)]
get_full_decompositions_index = () -> index_key_value get_full_decompositions(), 0, 1
get_stroke_count_index = () ->
data = read_csv_file("data/characters-strokes-decomposition.csv")
result = {}
result[a[0]] = parseInt a[1] for a in data
result
get_character_reading_count_index = () ->
result = {}
read_csv_file("data/characters-pinyin-count.csv").forEach (a) -> result[a[0] + a[1]] = parseInt a[2]
result
get_character_syllables_tones_count_index = () ->
result = {}
read_csv_file("data/syllables-tones-character-counts.csv").forEach (a) -> result[a[0]] = parseInt a[1]
result
get_character_example_words_f = () ->
  dictionary = dictionary_index_word_pinyin_f()
words = read_csv_file "data/words-by-frequency-with-pinyin-translation.csv"
(char, pinyin, frequency_limit) ->
char_word = words.find((b) -> b[0] is char)
unless char_word
char_word = dictionary char, pinyin
char_word = char_word[0] if char_word
char_words = if char_word then [char_word] else []
char_words.concat words.filter (b, i) -> b[0].includes(char) && b[0] != char && (!frequency_limit || i < frequency_limit)
sort_standard_character_readings = () ->
reading_count_index = get_character_reading_count_index()
path = "data/table-of-general-standard-chinese-characters.csv"
rows = read_csv_file(path).map (a) ->
char = a[0]
pinyin = a[1].split(", ").map (a) -> if a.match(/[0-5]$/) then a else a + "5"
pinyin = pinyin.sort (a, b) -> (reading_count_index[char + b] || 0) - (reading_count_index[char + a] || 0)
a[1] = pinyin.join ", "
a
write_csv_file path, rows
add_sort_field = (rows) ->
a.push i for a, i in rows
rows
update_pinyin_learning = () ->
# pinyin, word_choices -> word, translation
options =
words_per_char: 3
word_choices: 5
character_frequency_index = get_character_frequency_index()
get_character_example_words = get_character_example_words_f()
standard_chars = read_csv_file("data/table-of-general-standard-chinese-characters.csv")
chars = standard_chars.map (a) -> [a[0], a[1].split(", ")[0]]
chars = sort_by_character_frequency character_frequency_index, 0, chars
rows = for a in chars
a = get_character_example_words(a[0], a[1])
if 1 < a.length then a = a.slice 1, options.words_per_char + 1
[b[1], b[0], b[2]] for b in a
rows = rows.flat 1
rows = array_deduplicate_key rows, (a) -> a[1]
add_word_choices = (rows) ->
rows.map (a) ->
tries = 30
alternatives = [a[1]]
while tries && alternatives.length < options.word_choices
alternative = random_element rows
if a[1].length == alternative[1].length && a[0] != alternative[0] && !alternatives.includes(alternative[1])
alternatives.push alternative[1]
tries -= 1
a.push array_shuffle(alternatives).join(" ")
a
rows = add_sort_field add_word_choices rows
write_csv_file "data/pinyin-learning.csv", rows
get_char_pinyin = do ->
all_chars_and_pinyin = get_all_characters_with_pinyin()
char_pinyin_index = index_key_value all_chars_and_pinyin, 0, 1
dictionary = dictionary_index_word_f 0
(a) ->
b = dictionary a
return b[0][1] if b && b.length
b = char_pinyin_index[a]
return b if b
get_char_decompositions = do ->
decompositions = get_full_decompositions_index()
strokes = get_stroke_count_index()
(a) ->
b = decompositions[a]
return [] unless b
b = b.filter((a) -> !strokes[a] || strokes[a] > 1)
b.map((a) -> [a, get_char_pinyin(a)]).filter (a) -> a[1]
characters_add_learning_data = (rows) -> # [[character, pinyin], ...] -> [array, ...]
reading_count_index = get_character_reading_count_index()
character_by_reading_index = get_character_by_reading_index()
get_character_example_words = get_character_example_words_f()
rows = array_deduplicate_key(rows, (a) -> a[0])
syllables = delete_duplicates rows.map((a) -> a[1].split(", ")).flat()
add_same_reading_characters = (rows) ->
max_same_reading_characters = 24
rows.map (a) ->
b = (character_by_reading_index[a[1]] or []).slice(0, max_same_reading_characters)
b = b.filter (b) -> a[0] != b
a.push b.join ""
a
add_syllable_arrows = (rows) ->
rows.map (a) ->
arrow = get_syllable_circle_arrow a[1]
a.push arrow
a
add_contained_characters = (rows) ->
rows.map (a) ->
b = get_char_decompositions a[0]
c = b.map((c) -> c.join(" ")).join(", ")
a.push c
a
add_example_words = (rows) ->
rows.map (a) ->
words = get_character_example_words(a[0], a[1])
a.push(words.slice(1, 5).map((b) -> b[0]).join(" "))
a.push(words.slice(0, 5).map((b) -> b.join(" ")).join("\n"))
a
rows = add_contained_characters rows
rows = add_same_reading_characters(rows)
rows = add_sort_field rows
rows = add_syllable_arrows rows
rows = add_example_words rows
rows
fix_dependency_order = (items, char_key) ->
di = get_full_decompositions_index()
pm = {}
  for a, i in items
    pm[a[char_key]] = i
i = 0
while i < items.length
c = items[i][char_key]
deps = di[c] or []
for d in deps
j = pm[d]
if j? and j > i
dep = items.splice(j, 1)[0]
items.splice(i, 0, dep)
for k in [Math.min(i, j)..Math.max(i, j)]
pm[items[k][char_key]] = k
# stay at same i to recheck moved-in deps
i -= 1
break
i += 1
items
# test examples: 刀 < 那
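# Illustrative use (assuming the decomposition data lists 刀 as a component of 那):
#   fix_dependency_order [["那", "na4"], ["刀", "dao1"]], 0
#   # -> 刀 is moved before 那, so components precede the characters that contain them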
sort_by_frequency_f = (char_key) ->
fi = get_character_frequency_index()
  # characters missing from the frequency index sort last
  (a, b) -> (fi[a[char_key]] ? Infinity) - (fi[b[char_key]] ? Infinity)
sort_by_frequency = (data, char_key) -> data.sort sort_by_frequency_f char_key
sort_by_frequency_and_dependency = (data, char_key) ->
data = data.sort sort_by_frequency_f char_key
data = fix_dependency_order data, char_key
data
update_characters_learning = ->
rows = get_all_standard_characters_with_pinyin()
rows = sort_by_frequency_and_dependency rows, 0
rows = characters_add_learning_data rows
write_csv_file "data/characters-learning.csv", rows
rows = ([i + 1, a[0], a[1], a[5], a[3]] for a, i in rows)
write_csv_file "data/characters-learning-reduced.csv", rows
update_syllables_character_count = () ->
# number of characters with the same reading
chars = read_csv_file("data/characters-by-pinyin.csv").map (a) -> [a[0], a[1].length]
chars_without_tones = chars.map (a) -> [a[0].replace(/[0-5]/g, ""), a[1]]
get_data = (chars) ->
counts = {}
chars.forEach (a) ->
if counts[a[0]] then counts[a[0]] += a[1]
else counts[a[0]] = a[1]
chars = chars.map (a) -> a[0]
chars = delete_duplicates_stable chars
chars.map((a) -> [a, counts[a]]).sort (a, b) -> b[1] - a[1]
write_csv_file "data/syllables-tones-character-counts.csv", get_data(chars)
write_csv_file "data/syllables-character-counts.csv", get_data(chars_without_tones)
grade_text_files = (paths) ->
paths.forEach (a) -> console.log grade_text(read_text_file(a)) + " " + node_path.basename(a)
grade_text = (a) ->
  chars = delete_duplicates(a.match(hanzi_regexp) || [])
  return 1 unless chars.length
  frequency_index = get_character_frequency_index()
  all_chars_count = Object.keys(frequency_index).length
  frequencies = chars.map((a) -> frequency_index[a] ? all_chars_count).sort((a, b) -> a - b)
  count_score = chars.length / all_chars_count
  rarity_score = median(frequencies.splice(-10)) / all_chars_count
  Math.max 1, Math.round(10 * (count_score + rarity_score))
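# Illustrative use: returns an integer difficulty grade (1 = only very common
# characters; larger values mean more, and rarer, distinct characters):
#   grade_text "你好"                    # small grade
#   grade_text_files ["a.txt", "b.txt"]  # prints "<grade> <basename>" per file (hypothetical paths)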
character_exclusions_gridlearner = "灬罒彳𠂉⺈辶卝埶冃丏卝宀冖亠䒑丅丷一亅⿻㇀乚丨丿⿰�丶㇒㇏⿹乛㇓㇈⿸乀㇍⿺㇋㇂㇊丆⺊ユ⿶⿵⿴⿲コ凵⿳㇌⿷囗㇎㇅㇄厸䶹乛㇓㇈㇅㇄㇈一亅㇀ 乚丨丿丶㇒㇏㇇乛㇓乀㇍㇂㇊丆二⺊卜十冂ユコ㇄㇅㇎㇌乜㇋厸丫䶹凵囗乁"
character_exclusions = "⿱丅丷一亅⿻㇀乚丨丿⿰�丶㇒㇏⿹乛㇓㇈⿸乀㇍⿺㇋㇂㇊丆⺊ユ⿶⿵⿴⿲コ凵⿳㇌⿷囗㇎㇅㇄厸䶹乛㇓㇈㇅㇄㇈一亅㇀ 乚丨丿丶㇒㇏㇇乛㇓乀㇍㇂㇊丆二⺊卜十冂ユコ㇄㇅㇎㇌乜㇋厸丫䶹凵囗乁"
get_characters_contained_pinyin_rows = (exclusions = []) ->
pinyin_index = get_character_pinyin_index()
compositions_index = get_full_compositions_index()
edges = []
has_parent = new Set()
for parent_char of compositions_index
continue unless parent_char.match hanzi_regexp
continue if exclusions.includes parent_char
continue unless pinyin_index[parent_char]
for child_char in compositions_index[parent_char] when child_char.match hanzi_regexp
continue unless pinyin_index[child_char]
edges.push [parent_char, child_char, pinyin_index[child_char]]
has_parent.add child_char
for parent_char of compositions_index when not has_parent.has parent_char
continue unless parent_char.match hanzi_regexp
continue if exclusions.includes parent_char
continue unless pinyin_index[parent_char]
edges.push [null, parent_char, pinyin_index[parent_char]]
edges
get_characters_contained_rows = (exclusions = character_exclusions) ->
compositions = get_compositions_index()
rows = []
for char of compositions when char.match(hanzi_regexp) and not exclusions.includes(char)
rows.push [char, compositions[char]]
rows.sort (a, b) -> a[1].length - b[1].length
update_characters_contained = ->
rows = get_characters_contained_pinyin_rows()
rows_gridlearner = get_characters_contained_pinyin_rows character_exclusions_gridlearner
for a in rows_gridlearner
continue unless a[2]
a[2] = a[2] + get_syllable_circle_arrow a[2]
write_csv_file "data/gridlearner/characters-by-component.csv", rows_gridlearner
rows = get_characters_contained_rows character_exclusions
lines = (a[0] + " " + a[1].join("") for a in rows).join "\n"
fs.writeFileSync "data/characters-contained.txt", lines
rows = (a[0] + " " + get_char_decompositions(a[0]).join("") for a in rows)
fs.writeFileSync "data/characters-containing.txt", rows.join "\n"
update_characters_data = ->
graphics_data = JSON.parse read_text_file "data/characters-svg-animcjk-simple.json"
character_data = read_csv_file "data/characters-strokes-decomposition.csv"
compositions_index = get_compositions_index()
dictionary_lookup = dictionary_index_word_f 0
character_frequency_index = get_character_frequency_index()
result = []
for a, i in character_data
[char, strokes, decomposition] = a
strokes = parseInt strokes, 10
svg_paths = graphics_data[char] || ""
compositions = compositions_index[char] || []
entries = dictionary_lookup char
if entries and entries.length
entry = entries[0]
pinyin = entry[1]
else pinyin = ""
result.push [char, strokes, pinyin, decomposition || "", compositions.join(""), svg_paths]
result = sort_by_character_frequency character_frequency_index, 0, result
fs.writeFileSync "data/characters-svg.json", JSON.stringify result
get_common_words_per_character = (max_words_per_char, max_frequency) ->
character_frequency_index = get_character_frequency_index()
get_character_example_words = get_character_example_words_f()
standard_chars = read_csv_file "data/table-of-general-standard-chinese-characters.csv"
chars = standard_chars.map (a) -> [a[0], a[1].split(", ")[0]]
chars = sort_by_character_frequency character_frequency_index, 0, chars
rows = for a in chars
a = get_character_example_words a[0], a[1], max_frequency
if 1 < a.length then a = a.slice 0, max_words_per_char
a
rows = rows.flat 1
rows = array_deduplicate_key rows, (a) -> a[1]
is_file = (path) -> fs.statSync(path).isFile()
strip_extensions = (filename) -> filename.replace /\.[^.]+$/, ''
update_lists = (paths) ->
nav_links = []
paths = (a for a in paths when is_file a)
content = for path, i in paths
rows = read_csv_file path
parts = for row in rows
[head, tail...] = row
tail = tail.join " "
"""
#{head}#{tail}
"""
label = strip_extensions node_path.basename path
nav_links.push """
#{label}
"""
"" + parts.join("\n") + "
"
content = content.join "\n"
nav_links = nav_links.join "\n"
font = read_text_file "src/NotoSansSC-Light.ttf.base64"
html = read_text_file "src/lists-template.html"
html = replace_placeholders html, {font, content, nav_links}
fs.writeFileSync "tmp/lists.html", html
iconv = require "iconv-lite"
update_character_frequency = ->
buf = fs.readFileSync "/tmp/SUBTLEX-CH-CHR"
text = iconv.decode buf, "gb2312"
lines = text.split "\n"
chars = []
for line in lines when line.trim() and not line.startsWith("Character") and not line.startsWith("Total")
parts = line.trim().split /\s+/
chr = parts[0]
if chr.length is 1
chars.push chr
fs.writeFileSync "data/characters-by-frequency.txt", chars.join ""
update_word_frequency = ->
buf = fs.readFileSync "/tmp/SUBTLEX-CH-WF"
text = iconv.decode buf, "gb2312"
lines = text.split "\n"
words = []
for line in lines when line.trim() and not line.startsWith("Word")
parts = line.trim().split /\s+/
word = parts[0]
continue unless word.match /[\u4e00-\u9fff]/ # skip PUA and non-CJK
words.push word
fs.writeFileSync "data/words-by-frequency.txt", words.join "\n"
update_word_frequency_pinyin = ->
words = array_from_newline_file "data/words-by-frequency.txt"
dict = dictionary_index_word_f 0
result = for word in words
entry = dict word
continue unless entry
pinyin = entry[0][1]
[word, pinyin]
write_csv_file "data/words-by-frequency-with-pinyin.csv", result
get_practice_words = (num_attempts, max_freq) ->
# get a list of the most frequent words where each character ideally appears
# only once and no word appears twice.
word_frequency_index = get_word_frequency_index()
characters = get_all_standard_characters()
rows = read_csv_file "data/words-by-frequency-with-pinyin.csv"
rows = rows.filter (a)->
chars = split_chars a[0]
chars.length > 1 && chars[0] != chars[1]
candidate_words = {}
for [w, p] in rows
    freq = word_frequency_index[w] ? max_freq + 1
continue if freq > max_freq
for ch in split_chars w
continue unless ch in characters
(candidate_words[ch] ?= []).push [w,p,freq]
characters = characters.filter (ch)-> candidate_words[ch]?
for ch in characters
candidate_words[ch].sort (a,b)-> a[2] - b[2]
best_total_cost = Infinity
best_assign = null
for attempt in [0...num_attempts]
order = array_shuffle characters.slice()
counts = {}
used_words = {}
assign = {}
run_cost = 0
for ch in order
opts = candidate_words[ch]
best_score = Infinity
chosen = null
for [w,p,freq] in opts when not used_words[w]
score = sum(counts[c] || 0 for c in w) + freq
if score < best_score or (score is best_score and Math.random() < 0.5)
best_score = score
chosen = [w,p,freq]
continue unless chosen?
assign[ch] = chosen
used_words[chosen[0]] = true
counts[c] = (counts[c] || 0) + 1 for c in chosen[0]
run_cost += best_score
if run_cost < best_total_cost
best_total_cost = run_cost
best_assign = assign
words = ([x[0],x[1]] for ch,x of best_assign)
sort_by_word_frequency word_frequency_index, 0, words
update_practice_words = ->
rows = get_practice_words 1000, Infinity
write_csv_file "data/practice-words.csv", rows
update_gridlearner_data = ->
chars = get_all_characters_with_pinyin()
batch_size = 300
get_batch_index = (i) -> (1 + i / batch_size).toString().padStart 2, "0"
for i in [0...chars.length] by batch_size
data = ([a[0], a[1]] for a in chars[i...i + batch_size])
ii = get_batch_index i
write_csv_file "data/gridlearner/characters-pinyin-#{ii}.dsv", data
update_characters_series = ->
rows = read_csv_file "data/gridlearner/characters-by-component.csv"
graph = {}
for [p,c] in rows
object_array_add graph, p, c
max_start_degree = 30
memo = {}
  longest = (n) ->
    # memoized: returns all longest containment chains starting at n
    return memo[n] if memo[n]?
    kids = graph[n] or []
    return memo[n] = [[n]] unless kids.length
    memo[n] = for k in kids
      best = longest(k).reduce (a, b) -> if b.length > a.length then b else a
      [n].concat best
nodes = delete_duplicates_stable (rows.map((r)->r[0]).concat rows.map((r)->r[1]))
chains = []
for n in nodes when (graph[n]?.length||0) and graph[n].length <= max_start_degree
chains = chains.concat longest n
seen = new Set()
uniq = []
for ch in chains when ch.length > 2
id = ch.join ""
continue if seen.has id
uniq.push ch
seen.add id
sub = (a,b)-> b.join("").includes a.join("")
uniq = uniq.filter (c)-> not uniq.some (d)-> d isnt c and d.length>c.length and sub c,d
uniq = uniq.sort (a,b)-> b.length - a.length
fs.writeFileSync "data/characters-series.txt", uniq.map((c)->c.join "").join "\n"
similar_initial = (s1, s2) ->
pairs =
c: "z", z: "c",
j: "q", q: "j",
k: "g", g: "k"
  # quick-and-dirty initial extractor, good enough for the pairs above
initial = (s) ->
if s.startsWith("zh") or s.startsWith("ch") or s.startsWith("sh")
s.slice 0, 2
else s[0]
i1 = initial s1
i2 = initial s2
r1 = s1.slice i1.length
r2 = s2.slice i2.length
(pairs[i1] is i2) and (r1 is r2)
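# similar_initial compares two toneless syllables and reports whether they differ
# only by one of the easily confused initial pairs above. Illustrative use:
#   similar_initial "gan", "kan"  # -> true  (g/k pair, same final)
#   similar_initial "ju", "qu"    # -> true  (j/q pair)
#   similar_initial "ba", "pa"    # -> false (b/p is not in the pairs table)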
update_characters_links = ->
pinyin_index = get_character_pinyin_index() # {char → "xx4"}
tone_index = get_character_tone_index() # {char → 4}
rows = read_csv_file "data/gridlearner/characters-by-component.csv"
by_component = {}
rows.forEach ([component, carrier]) ->
return unless component and carrier
object_array_add by_component, component, carrier
output_rows = []
for comp_char, carriers of by_component
base_py = pinyin_index[comp_char]
continue unless base_py # skip if the component itself lacks a reading
base_py = base_py.split(",")[0]
base_syl = base_py.replace /[0-5]$/, ""
base_tone = parseInt base_py.slice(-1), 10
tone_syll = []
tone_only = []
syl_only = []
init_links = []
carriers.forEach (c) ->
return if c is comp_char
cp = pinyin_index[c]
return unless cp
cp = cp.split(",")[0]
c_syl = cp.replace /[0-5]$/, ""
c_tone = parseInt cp.slice(-1), 10
if cp is base_py then tone_syll.push c
else if c_tone is base_tone then tone_only.push c
else if c_syl is base_syl then syl_only.push c
else if similar_initial base_syl, c_syl then init_links.push c
dedup = delete_duplicates_stable
[tone_syll, tone_only, syl_only, init_links] =
(dedup lst for lst in [tone_syll, tone_only, syl_only, init_links])
if tone_syll.length or tone_only.length or syl_only.length or init_links.length
output_rows.push [
comp_char,
tone_syll.join(""),
syl_only.join(""),
tone_only.join(""),
init_links.join("")
]
write_csv_file "data/character-links.csv", output_rows
run = ->
update_characters_series()
#update_characters_links()
#find_longest_containment_chains()
#collect_characters_by_syllable_containment()
module.exports = {
read_text_file
get_characters_by_pinyin_rows
clean_frequency_list
replace_placeholders
object_array_add
get_all_characters_with_pinyin
update_dictionary
update_characters_data
traditional_to_simplified
pinyin_to_hanzi
hanzi_to_pinyin
mark_to_number
update_characters_by_pinyin
update_characters_learning
update_pinyin_learning
grade_text
grade_text_files
run
update_lists
}