# extract information from the jmdict xml file and save as json fs = require "fs" parse_xml = require("xml2js").parseString wanakana = require "wanakana" object_array_add = (object, key, value) -> if object[key] then object[key].push value else object[key] = [value] array_contains_any = (a, b) -> b.some (b) -> a.includes(b) find_reading = (r_ele, frequency_tags) -> if frequency_tags.length a = r_ele.find (a) -> a.RE_PRI and array_contains_any(a.RE_PRI, frequency_tags) a and a.REB[0] else r_ele[0].REB[0] update_json = (config) -> # convert misc tags to the format used in jmdict exclusions = config.misc_tag_exclusions.map (a) -> "&#{a};" # parse xml to an object xml = fs.readFileSync config.jmdict_path parse_xml xml, {strict: false}, (error, jmdict) -> if error console.log error return result = {} # remove unneeded information jmdict.JMDICT.ENTRY.forEach (entry) -> # ignore words without kanji # take the first writing and select a common reading if entry.K_ELE word = entry.K_ELE[0].KEB[0] else return if config.only_words_with_kanji word = entry.R_ELE[0].REB[0] reading = find_reading entry.R_ELE, config.frequency_tags return unless reading reading = wanakana.toRomaji reading # select meanings translations = [] entry.SENSE.forEach (a) -> return unless translations.length < config.translations_limit unless a.MISC and a.MISC.some (a) -> exclusions.includes(a) b = a.GLOSS.map (a) -> if Array.isArray a then a.join("; ") if "string" == typeof(a) then a else null b = b.filter (a) -> a translations.push b.join(", ") if b.length return if 0 == translations.length object_array_add result, word, [reading, translations] fs.writeFileSync config.output_path, JSON.stringify result module.exports = update_json: update_json