Best Python code snippet using nose
template_odt.py
Source: template_odt.py
...
def _expand_links(self, matches):
    (source, label, external) = self._process_link(matches)
    if(source.startswith("#")):
        temp = source[1:]
        if(self.m_engine.is_wiki_word(temp)):
            return temp
    # DEBUG BRAD: This is a temporary hack to get links of the format -> to work
    source = re.sub(".*?#(.*)", "\\1", source)
    label = re.sub(".*?#(.*)", "\\1", label)
    #print "SOURCE = %s, LABEL = %s" % (source, label)
    #source.replace("->", "")
    #label = label.replace("->", "")
    # Unconvert any XML in case it has already been
    # converted by format_text()
    label = self.unxmlize(label)
    # Now make the label XML safe again
    label = self.xmlize(label)
    if(source[0:4] == "http" or external == True):
        if(source[0:4] != "http"):
...
build_union_db.py
Source: build_union_db.py
#! /usr/bin/python3
# -*- coding: utf-8 -*-
#--------------------------------------------------------------------------------------------------
# Script to build a union database by merging TSV dictionaries
#
# Usage:
#   build_union_db.py [--output str] [--core str] [--gross str] [--top str] [--slim str]
#     [--phrase_prob str] [--tran_prob str] [--tran_aux str] [--tran_aux_last str]
#     [--rev_prob str] [--cooc_prob str] [--aoa str] [--keyword str] [--min_prob str]
#     [--quiet] inputs...
#   (An input is specified as "label:tsv_file".)
#
# Example:
#   ./build_union_db.py --output union-body.tkh \
#     --phrase_prob enwiki-phrase-prob.tkh --tran_prob tran-prob.tkh \
#     --tran_aux dict1.tsv,dict2.tsv --rev_prob jawiki-word-prob.tkh \
#     --cooc_prob enwiki-cooc-prob.tkh --min_prob we:0.00001 \
#     wj:wiktionary-ja.tsv wn:wordnet.tsv we:wiktionary-en.tsv
#
# Copyright 2020 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of the License at
#     https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the
# License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific language governing permissions
# and limitations under the License.
#--------------------------------------------------------------------------------------------------
import collections
import json
import logging
import math
import operator
import os
import regex
import sys
import time
import tkrzw
import tkrzw_dict
import tkrzw_pron_util
import tkrzw_tokenizer
import unicodedata
logger = tkrzw_dict.GetLogger()
poses = ("noun", "verb", "adjective", "adverb",
         "pronoun", "auxverb", "preposition", "determiner", "article",
         "interjection", "conjunction", "prefix", "suffix",
         "abbreviation", "phrase", "misc")
inflection_names = ("noun_plural", "verb_singular", "verb_present_participle",
                    "verb_past", "verb_past_participle",
                    "adjective_comparative", "adjective_superlative",
                    "adverb_comparative", "adverb_superlative")
etymology_names = ("etymology_prefix", "etymology_core", "etymology_suffix")
top_names = ("pronunciation",) + inflection_names + etymology_names
rel_weights = {"synonym": 1.0,
               "hypernym": 0.9,
               "hyponym": 0.8,
               "antonym": 0.2,
               "derivative": 0.7,
               "relation": 0.5}
noun_suffixes = [
  "es", "s", "ment", "age", "ics", "ness", "ity", "ism", "or", "er", "ist", "t", "pt", "th",
  "ian", "ee", "tion", "sion", "ty", "ance", "ence", "ency", "cy", "ry", "ary", "ery", "ory",
  "al", "age", "dom", "hood", "ship", "nomy", "ing", "ication", "icator", "ce", "se", "son",
  "iation", "ant", "faction", "ture", "sure", "nance", "y", "ess",
]
verb_suffixes = [
  "ify", "en", "ize", "ise", "fy", "ate", "age", "e",
]
adjective_suffixes = [
  "some", "able", "ible", "ic", "ical", "ial", "ive", "ful", "less", "ly", "ous", "y",
  "tic", "ine", "ised", "ing", "ed", "ish", "al", "ual", "icable", "er", "est", "ent", "ific",
  "ative", "tative", "ant", "ary",
]
adverb_suffixes = [
  "ly",
]
particles = {
  "aback", "about", "above", "abroad", "across", "after", "against", "ahead", "along",
  "amid", "among", "apart", "around", "as", "at", "away", "back", "before", "behind",
  "below", "beneath", "between", "beside", "beyond", "by", "despite", "during", "down",
  "except", "for", "forth", "from", "in", "inside", "into", "near", "of", "off", "on",
"off", "on",82 "onto", "out", "outside", "over", "per", "re", "since", "than", "through", "throughout",83 "till", "to", "together", "toward", "under", "until", "up", "upon", "with", "within",84 "without", "via",85}86misc_stop_words = {87 "the", "a", "an", "I", "my", "me", "mine", "you", "your", "yours", "he", "his", "him",88 "she", "her", "hers", "it", "its", "they", "their", "them", "theirs",89 "we", "our", "us", "ours", "some", "any", "one", "someone", "something",90 "myself", "yourself", "yourselves", "himself", "herself", "itself", "themselves",91 "who", "whom", "whose", "what", "where", "when", "why", "how", "and", "but", "not", "no",92 "never", "ever", "time", "place", "people", "person", "this", "these", "that", "those",93 "other", "another", "yes", "thou",94 "back", "much", "many", "more", "most", "good", "well", "better", "best", "all",95}96wiki_stop_words = {97 "wikipedia", "encyclopedia", "page", "pages", "edit", "edits", "comment", "comments",98}99no_parents = {100 "number", "ground", "red", "happen", "letter", "monitor", "feed", "winter", "brake",101 "partner", "sister", "environment", "moment", "gun", "shower", "trigger", "wound", "bound",102 "weed", "saw", "copper", "buffer", "lump", "wary", "stove", "doctor", "hinder",103 "tower", "poetry", "parity", "fell", "lay", "bit", "drug", "grass", "shore", "notice",104 "butter", "slang", "grope", "feces", "left", "former", "found", "every", "scheme",105 "evening", "architecture", "hat", "slice", "bite", "tender", "bully", "translate",106 "fence", "liver", "special", "specific", "species", "statistics", "mathematics", "caution",107 "span", "fleet", "language", "gripe", "dribble", "total", "error", "option", "important",108 "shine", "dental", "irony", "transplant", "chemistry", "physics", "grocery", "grade",109 "gutter", "dove", "weary", "queer", "shove", "buggy", "twine", "tier", "rung", "spat",110 "pang", "jibe", "pent", "lode", "gelt", "plant", "plane", "pants", "craze", "grove",111 "downy", "musty", "mangy", "moped", "caper", "balmy", "tinny", "induce", "treaty",112 "chili", "chilli", "chile", "castor", "landry", "start", "baby", "means", "transfer",113 "interior", "exterior", "rabbit", "stripe", "fairy", "shunt", "clove", "abode", "bends",114 "molt", "holler", "feudal", "bounce", "livery", "wan", "sod", "dug", "het", "gat",115 "cover", "book", "cause", "quality", "process", "provide", "entry", "specify", "morning",116 "guarantee", "listen", "identity", "clone", "impress", "belly", "mansion",117}118force_parents = {119 "upwards": "upward", "towards": "toward", "identify": "identity", "guaranty": "guarantee",120 "advice": "advise", "device": "devise", "practice": "practise", "morn": "morning",121 "approximately": "approximate", "invocation": "invoke", "spec": "specify",122 "prisoner": "prison", "emission": "emit", "omission": "omit", "transmission": "transmit",123 "fission": "fissure", "competitive": "compete", "competitor": "compete",124 "conservative": "conserve", "pronunciation": "pronounce", "revelation": "reveal",125 "possession": "possess", "schema": "scheme", "further": "far", "farther": "far",126 "conjunction": "conjunct", "conjunctive": "conjunct", "location": "locate",127 "conjugation": "conjugate", "conjugative": "conjugate", "installation": "install",128 "translation": "translate", "formation": "form", "variation": "vary",129 "importance": "important", "innovative": "innovate", "bated": "bate",130 "chemist": "chemistry", "chemical": "chemistry", "chem": "chemistry",131 "architect": "architecture", "grocer": "grocery", 
"critic": "critique",132 "chilly": "chill", "launder": "laundry", "tension": "tense", "revolution": "revolve",133 "sensitive": "sense", "mutation": "mutate", "mutant": "mutate", "fated": "fate",134 "apery": "ape", "dingy": "dinge", "precession": "precess", "expertise": "expert",135 "dramatic": "drama", "pic": "picture", "tragic": "tragedy", "manse": "mansion",136 "administrate": "administer", "administrative": "administrate", "inquiry": "inquire",137 "administration": "administrate", "administrator": "administrate", "diplomat": "diplomacy",138 "federal": "federation", "analysis": "analyze", "emphasis": "emphasize",139 "chlorine": "chloride", "recognition": "recognize", "opposite": "oppose", "opponent": "oppose",140 "response": "respond", "tolerant": "tolerate", "remainder": "remain",141 "differential": "different", "differentiate": "different", "failure": "fail",142 "explosive": "explode", "civilization": "civil", "civilize": "civil",143 "success": "succeed", "application": "apply", "therapeutic": "therapy",144 "medical": "medicine", "beneficial": "benefit", "pianist": "piano",145 "blonde": "blond", "classification": "classify", "classify": "class",146 "technique": "technical", "technology": "technique", "technician": "technical",147 "millionaire": "million", "billionaire": "billion", "cigarette": "cigar",148 "adhesion": "adhere", "adhesive": "adhere", "chaotic": "chaos", "disclosure": "disclose",149 "destruction": "destroy", "concession": "concede", "rental": "rent",150 "influential": "influence", "strategic": "strategy", "minimal": "minimum",151 "mini": "minimum", "triangular": "triangle", "rebellion": "rebel",152 "intent": "intend", "replica": "replicate", "timer": "time", "timed": "time",153 "sparkle": "spark", "consensus": "consent", "probably": "probable", "pleasant": "please",154 "philosopher": "philosophy", "radiate": "radius", "tutorial": "tutor",155 "terminal": "terminus", "terminate": "terminus", "grief": "grieve", "grievance": "grieve",156 "anime": "animate", "surgeon": "surgery", "partition": "part", "pretense": "pretend",157 "concept": "conceive", "conceptual": "conceive", "solidarity": "solid",158 "economic": "economy", "economist": "economy", "decisive": "decide",159 "offense": "offend", "offensive": "offend", "necessary": "necessity",160 "frequency": "frequent", "portrait": "portray", "digital": "digit",161 "storage": "store", "nearly": "near", "granny": "grandmother", "sorry": "sorrow",162 "modification": "modify", "characteristic": "character", "anxious": "anxiety",163 "quantify": "quantity", "qualify": "quality", "appendix": "append",164 "quantitative": "quantity", "qualitative": "quality", "supremacy": "supreme",165 "imaginary": "imagine", "imaginative": "imagine", "disastrous": "disaster",166 "systematic": "system", "absorption": "absorb", "disciple": "discipline",167 "favorite": "favor", "prescription": "prescribe", "dominant": "dominate",168 "relief": "relieve", "laughter": "laugh", "participant": "participate",169 "companion": "company", "circular": "circle", "synthetic": "synthesis",170 "choice": "choose", "entrance": "entry", "maintenance": "maintain",171 "social": "society", "substantial": "substance", "identification": "identify",172 "assumption": "assume", "poet": "poetry", "info": "information", "information": "inform",173 "visible": "vision", "realistic": "real", "consumption": "consume", "reception": "recept",174 "photo": "photograph", "demo": "demonstrate", "publish": "public",175 "volunteer": "voluntary", "politician": "politics", "rationale": 
"rational",176 "physician": "physic", "physicist": "physics", "spectral": "specter",177 "birdie": "bird", "distillate": "distill", "earnings": "earn", "chimp": "chimpanzee",178 "nutrient": "nutrition", "nutritive": "nutrition", "delicacy": "delicate",179 "suspicion": "suspect", "disbelief": "disbelieve", "provocative": "provoke",180 "irritant": "irritate", "displeasure": "displease", "erroneous": "error",181 "humility": "humiliate", "consequence": "consequent", "barbaric": "barbarian",182 "mystic": "mystery", "festive": "festival", "festal": "festival", "intimacy": "intimate",183 "respiratory": "respiration", "respirator": "respiration", "sarcastic": "sarcasm",184 "crucify": "crucifix", "crucifixion": "crucifix", "abdominal": "abdomen",185 "medial": "median", "bureaucrat": "bureau", "wholly": "whole", "consul": "consulate",186 "repetition": "repeat", "repetitive": "repeat", "conquest": "conquer", "cavern": "cave",187 "rubbish": "rubble", "flammable": "flame", "ignorant": "ignore", "solitude": "solitary",188 "curiosity": "curious", "exceptionally": "exceptional", "blotch": "blot", "suckle": "suck",189 "negligent": "neglect", "negligence": "neglect", "infamous": "infamy",190 "deception": "deceit", "deceit": "deceive", "deceptive": "deceive",191 "irritable": "irritate", "prevalent": "prevail", "accusatory": "accuse",192 "conspiracy": "conspire", "envelop": "envelope", "capacitance": "capacitor",193 "romantic": "romance", "perm": "permanent", "feminist": "feminine",194 "demolition": "demolish", "trivial": "trivia", "instantaneous": "instant",195 "expense": "expend", "expenditure": "expend", "memorize": "memory",196 "memo": "memorandum", "consortium": "consort", "medallion": "medal", "godless": "god",197 "abrasion": "abrase", "abrasive": "abrase", "atheist": "atheism", "reunion": "reunite",198 "kindergartner": "kindergarten", "duckling": "duck", "introductory": "introduce",199 "baptism": "baptize", "sled": "sledge", "bobsled": "bobsleigh",200 "tarp": "tarpaulin", "intricacy": "intricate", "reverberate": "reverb",201 "glacial": "glacier", "legislature": "legislate", "redemption": "redeem",202 "predominant": "predominate", "lull": "lullaby", "butt": "buttock", "comfy": "comfort",203 "verification": "verify", "spectacular": "spectacle", "applause": "applaud",204 "theoretical": "theory", "curvature": "curve", "simply": "simple", "cafe": "cafeteria",205 "discussion": "discuss", "comparable": "compare", "comparative": "compare",206 "hysteric": "hysteria", "partial": "part", "generosity": "generous", "maths": "math",207 "prophecy": "prophesy", "prophet": "prophecy", "satisfactory": "satisfy",208 "fulfillment": "fulfill", "sufficient": "suffice", "energetic": "energy",209 "cosmic": "cosmos", "petrol": "petroleum", "applicable": "apply", "splendid": "splendor",210 "reproductive": "reproduce", "apologetic": "apology", "nervous": "nerve",211 "metabolic": "metabolism", "potency": "potent", "impotency": "impotent", "penal": "penalty",212 "migratory": "migrate", "migrant": "migrate", "immigrant": "immigrate", "emigrant": "emigrate",213 "amphibious": "amphibian", "menstrual": "menstruation", "president": "preside",214 "receptionist": "reception", "reception": "receive", "receipt": "receive",215 "receptive": "receive", "remembrance": "remember", "heartbroken": "heartbreak",216 "residential": "residence", "residency": "residence", "resident": "residence",217 "preparatory": "prepare", "glamorous": "glamour", "defense": "defend",218 "cellular": "cell", "viscosity": "viscous", "rhino": "rhinoceros", "hippo": 
"hippopotamus",219 "ancestral": "ancestor", "negative": "negate", "bacteria": "bacterium",220 "registration": "register", "registry": "register", "inaugural": "inaugurate",221 "alkaline": "alkali", "humane": "human", "divisible": "divide", "capacity": "capable",222 "grandpa": "grandfather", "grandma": "grandmother", "nauseous": "nausea",223 "luncheon": "lunch", "conscientious": "conscience", "mandatory": "mandate",224 "cleric": "clergy", "corrosion": "corrode", "limo": "limousine", "descriptive": "describe",225 "inflammable": "inflame", "inflammation": "inflame", "tremble": "tremor",226 "enthusiast": "enthusiasm", "pussy": "puss", "considerate": "consider",227 "eternity": "eternal", "monstrous": "monster", "clarity": "clarify", "illiteracy": "illiterate",228 "clarification": "clarify", "muscular": "muscle", "furniture": "furnish",229 "perception": "perceive", "percept": "perceive", "sensory": "sense", "symptomatic": "symptom",230 "destination": "destine", "categorical": "category", "ascent": "ascend",231 "ingenuity": "ingenious", "invention": "invent", "gymnast": "gymnastic",232 "propel": "propulsion", "belief": "believe", "whimsy": "whim", "disciplinary": "discipline",233 "mischievous": "mischief", "crazy": "craze", "liquefy": "liquid", "delicacy": "delicate",234 "confectionery": "confection", "resilience": "resilient", "grad": "graduate",235 "therapist": "therapy", "perseverance": "persevere", "intro": "introduction",236 "abolition": "abolish", "reparation": "repair", "testify": "testimony", "sports": "sport",237 "disqualification": "disqualify", "rectangular": "rectangle", "metropolitan": "metropolis",238 "sportsmanship": "sportsman", "atheist": "atheism", "prognostic": "prognosis",239 "assurance": "assure", "insurance": "insure", "extent": "extend", "mineral": "mine",240 "fort": "fortress", "pharmaceutical": "pharmacy", "menstrual": "menstruum",241 "community": "commune", "communal": "commune", "data": "datum", "agenda": "agendum",242 "metric": "meter", "democrat": "democracy", "presumption": "presume", "shelf": "shelve",243 "solitaire": "solitary", "explanatory": "explain", "woolen": "wool", "planar": "plain",244 "northeastern": "northeast", "northwestern": "northwest",245 "southeastern": "southeast", "southwestern": "southwest",246 "eastward": "east", "westward": "west", "wintry": "winter",247}248class BuildUnionDBBatch:249 def __init__(self, input_confs, output_path, core_labels, full_def_labels, gross_labels,250 surfeit_labels, top_labels, slim_labels, tran_list_labels, supplement_labels,251 phrase_prob_path, tran_prob_path, tran_aux_paths, tran_aux_last_paths,252 rev_prob_path, cooc_prob_path, aoa_paths, keyword_path, min_prob_map):253 self.input_confs = input_confs254 self.output_path = output_path255 self.core_labels = core_labels256 self.full_def_labels = full_def_labels257 self.gross_labels = gross_labels258 self.surfeit_labels = surfeit_labels259 self.top_labels = top_labels260 self.slim_labels = slim_labels261 self.tran_list_labels = tran_list_labels262 self.supplement_labels = supplement_labels263 self.phrase_prob_path = phrase_prob_path264 self.tran_prob_path = tran_prob_path265 self.tran_aux_paths = tran_aux_paths266 self.tran_aux_last_paths = tran_aux_last_paths267 self.rev_prob_path = rev_prob_path268 self.cooc_prob_path = cooc_prob_path269 self.aoa_paths = aoa_paths270 self.keyword_path = keyword_path271 self.min_prob_map = min_prob_map272 self.tokenizer = tkrzw_tokenizer.Tokenizer()273 def Run(self):274 start_time = time.time()275 logger.info("Process started: 
      str(self.input_confs), self.output_path))
    word_dicts = []
    for label, input_path in self.input_confs:
      slim = label in self.slim_labels
      word_dict = self.ReadInput(input_path, slim)
      word_dicts.append((label, word_dict))
    aux_trans = {}
    for tran_aux_path in self.tran_aux_paths:
      if not tran_aux_path: continue
      self.ReadTranAuxTSV(tran_aux_path, aux_trans)
    aux_last_trans = {}
    for tran_aux_last_path in self.tran_aux_last_paths:
      if not tran_aux_last_path: continue
      self.ReadTranAuxTSV(tran_aux_last_path, aux_last_trans)
    raw_aoa_words = collections.defaultdict(list)
    for aoa_path in self.aoa_paths:
      if not aoa_path: continue
      self.ReadAOAWords(aoa_path, raw_aoa_words)
    aoa_words = {}
    for word, values in raw_aoa_words.items():
      aoa_words[word] = sum(values) / len(values)
    keywords = set()
    if self.keyword_path:
      self.ReadKeywords(self.keyword_path, keywords)
    self.SaveWords(word_dicts, aux_trans, aux_last_trans, aoa_words, keywords)
    logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
  def NormalizeText(self, text):
    text = unicodedata.normalize('NFKC', text)
    text = regex.sub(r"[\u2018\u2019\u201A\u201B\u2758\u275B\u275C\u275F\uFF02]", "'", text)
    text = regex.sub(r"[\u201C\u201D\u201E\u201F]", '"', text)
    text = regex.sub(
      r"[\u00AD\u02D7\u2010\u2011\u2012\u2013\u2014\u2015\u2043\u2212\u2796\u2E3A\u2E3B" +
      r"\uFE58\uFE63\uFF0D]", "-", text)
    return text
  def ReadInput(self, input_path, slim):
    start_time = time.time()
    logger.info("Reading an input file: input_path={}".format(input_path))
    word_dict = collections.defaultdict(list)
    num_entries = 0
    with open(input_path) as input_file:
      for line in input_file:
        word = ""
        ipa = ""
        sampa = ""
        texts = []
        inflections = {}
        etymologies = {}
        alternatives = []
        mode = ""
        rel_words = {}
        for field in line.strip().split("\t"):
          columns = field.split("=", 1)
          if len(columns) < 2: continue
          name, value = columns
          value = self.NormalizeText(value)
          value = regex.sub(r"[\p{Z}\p{C}]+", " ", value).strip()
          if name == "word":
            word = value
          elif name == "pronunciation_ipa":
            ipa = value
          elif name == "pronunciation_sampa":
            sampa = value
          elif name.startswith("inflection_"):
            name = regex.sub(r"^[a-z]+_", "", name)
            inflections[name] = inflections.get(name) or value
          elif name.startswith("etymology_"):
            etymologies[name] = value
          elif name == "alternative":
            for alt_word in value.split(","):
              alt_word = alt_word.strip()
              if alt_word:
                alternatives.append(alt_word)
          elif name in poses:
            if slim:
              value = regex.sub(r" \[-+\] .*", "", value).strip()
            if value:
              texts.append((name, value))
          elif name in rel_weights:
            rel_words[name] = value
          elif name == "mode":
            mode = value
        if not ipa and sampa:
          ipa = tkrzw_pron_util.SampaToIPA(sampa)
        if not word or len(word) > 48:
          continue
        if ipa or texts or inflections or etymologies or alternatives:
          key = tkrzw_dict.NormalizeWord(word)
          entry = {"word": word}
          if ipa:
            entry["pronunciation"] = ipa
          for name, value in inflections.items():
            entry[name] = value
          for name, value in etymologies.items():
            entry[name] = value
          if alternatives:
            entry["alternative"] = alternatives
          entry["text"] = texts
          for rel_name, rel_value in rel_words.items():
            entry[rel_name] = rel_value
          if mode:
            key += "\t" + mode
          word_dict[key].append(entry)
        num_entries += 1
        if num_entries % 10000 == 0:
          logger.info("Reading an input: num_entries={}".format(num_entries))
    logger.info("Reading an input done: num_entries={}, elapsed_time={:.2f}s".format(
      num_entries, time.time() - start_time))
    return word_dict
  def ReadTranAuxTSV(self, input_path, aux_trans):
    start_time = time.time()
    logger.info("Reading a translation aux file: input_path={}".format(input_path))
    num_entries = 0
    with open(input_path) as input_file:
      for line in input_file:
        fields = line.strip().split("\t")
        if len(fields) < 2: continue
        word = self.NormalizeText(fields[0])
        values = aux_trans.get(word) or []
        uniq_trans = set()
        for tran in fields[1:]:
          tran = self.NormalizeText(tran)
          tran = regex.sub(r"[\p{Ps}\p{Pe}\p{C}]", "", tran)
          tran = regex.sub(r"[\p{Z}\p{C}]+", " ", tran).strip()
          norm_tran = tkrzw_dict.NormalizeWord(tran)
          if not tran or not norm_tran: continue
          if regex.search(r"\p{Latin}.*の.*(形|名詞|級)", tran): continue
          if norm_tran in uniq_trans: continue
          uniq_trans.add(norm_tran)
          values.append(tran)
        aux_trans[word] = values
        num_entries += 1
        if num_entries % 10000 == 0:
          logger.info("Reading a translation aux file: num_entries={}".format(num_entries))
    logger.info("Reading a translation aux file: num_entries={}, elapsed_time={:.2f}s".format(
      num_entries, time.time() - start_time))
  def ReadAOAWords(self, input_path, aoa_words):
    start_time = time.time()
    logger.info("Reading an AOA file: input_path={}".format(input_path))
    num_entries = 0
    with open(input_path) as input_file:
      is_first = True
      for line in input_file:
        if is_first:
          is_first = False
          continue
        fields = line.strip().split(",")
        if len(fields) != 7: continue
        word = self.NormalizeText(fields[0]).strip()
        occur = fields[3]
        mean = fields[4]
        stddev = fields[5]
        if not word or not regex.fullmatch(r"[0-9.]+", mean): continue
        if not regex.fullmatch(r"[.0-9]+", occur): continue
        mean = float(mean)
        if regex.fullmatch(r"[0-9.]+", stddev):
          mean += float(stddev)
        else:
          mean += 3.0
        aoa_words[word].append(mean)
        num_entries += 1
        if num_entries % 10000 == 0:
          logger.info("Reading an AOA file: num_entries={}".format(num_entries))
    logger.info("Reading an AOA file: num_entries={}, elapsed_time={:.2f}s".format(
      num_entries, time.time() - start_time))
  def ReadKeywords(self, input_path, keywords):
    start_time = time.time()
    logger.info("Reading a keyword file: input_path={}".format(input_path))
    num_entries = 0
    with open(input_path) as input_file:
      for line in input_file:
        keyword = self.NormalizeText(line).strip()
        keywords.add(keyword)
        num_entries += 1
        if num_entries % 10000 == 0:
          logger.info("Reading a keyword file: num_entries={}".format(num_entries))
    logger.info("Reading a keyword file: num_entries={}, elapsed_time={:.2f}s".format(
      num_entries, time.time() - start_time))
  def SaveWords(self, word_dicts, aux_trans, aux_last_trans, aoa_words, keywords):
    logger.info("Preparing DBMs")
    phrase_prob_dbm = None
    if self.phrase_prob_path:
      phrase_prob_dbm = tkrzw.DBM()
      phrase_prob_dbm.Open(self.phrase_prob_path, False, dbm="HashDBM").OrDie()
    tran_prob_dbm = None
    if self.tran_prob_path:
      tran_prob_dbm = tkrzw.DBM()
      tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
    rev_prob_dbm = None
    if self.rev_prob_path:
      rev_prob_dbm = tkrzw.DBM()
      rev_prob_dbm.Open(self.rev_prob_path, False, dbm="HashDBM").OrDie()
    cooc_prob_dbm = None
    if self.cooc_prob_path:
      cooc_prob_dbm = tkrzw.DBM()
      cooc_prob_dbm.Open(self.cooc_prob_path, False, dbm="HashDBM").OrDie()
    start_time = time.time()
    logger.info("Extracting keys")
    keys = set()
    for label, word_dict in word_dicts:
      for key in word_dict.keys():
        if key.find("\t") >= 0: continue
        keys.add(key)
    logger.info("Extracting keys done: num_keys={}, elapsed_time={:.2f}s".format(
      len(keys), time.time() - start_time))
    start_time = time.time()
    logger.info("Indexing stems")
    stem_index = collections.defaultdict(list)
    for label, word_dict in word_dicts:
      if label in self.supplement_labels: continue
      for key in keys:
        for entry in word_dict[key]:
          word = entry["word"]
          if not regex.fullmatch("[a-z]+", word): continue
          stems = self.GetDerivativeStems(label, entry, word_dict, aux_trans, phrase_prob_dbm)
          if stems:
            valid_stems = set()
            for stem in stems:
              if stem in keys:
                stem_index[stem].append(word)
                valid_stems.add(stem)
            if valid_stems:
              entry["stem"] = list(valid_stems.union(set(entry.get("stem") or [])))
    for label, word_dict in word_dicts:
      if label not in self.core_labels: continue
      for key in keys:
        for entry in word_dict[key]:
          word = entry["word"]
          children = stem_index.get(word)
          if children:
            entry["stem_child"] = list(set(children))
    logger.info("Indexing stems done: num_stems={}, elapsed_time={:.2f}s".format(
      len(stem_index), time.time() - start_time))
    start_time = time.time()
    logger.info("Checking POS of words")
    noun_words = set()
    verb_words = set()
    adj_words = set()
    adv_words = set()
    for label, word_dict in word_dicts:
      if label in self.core_labels:
        for key in keys:
          for entry in word_dict[key]:
            word = entry["word"]
            for pos, text in entry["text"]:
              if pos == "noun": noun_words.add(word)
              if pos == "verb": verb_words.add(word)
              if pos == "adjective": adj_words.add(word)
              if pos == "adverb": adv_words.add(word)
    logger.info("Checking POS of words done: elapsed_time={:.2f}s".format(
      time.time() - start_time))
    start_time = time.time()
    logger.info("Indexing base forms")
    extra_word_bases = {}
    for label, word_dict in word_dicts:
      if label not in self.top_labels: continue
      base_index = collections.defaultdict(list)
      core_index = collections.defaultdict(list)
      for key, entries in word_dict.items():
        for entry in entries:
          word = entry["word"]
          if not regex.fullmatch("[a-z]+", word): continue
          if word in verb_words:
            children = set()
            for part_name in ("verb_present_participle", "verb_past_participle"):
              for part in (entry.get(part_name) or "").split(","):
                part = part.strip()
                if part and part != word and (part in noun_words or part in adj_words):
                  base_index[part].append(word)
                  extra_word_bases[part] = word
                  children.add(part)
            if children:
              entry["base_child"] = list(children)
          if word in adj_words:
            children = set()
            for part_name in ("adjective_comparative", "adjective_superlative"):
              part = entry.get(part_name)
              if part and (part in noun_words or part in adj_words):
                base_index[part].append(word)
                children.add(part)
            if children:
              entry["base_child"] = list(children)
          core = entry.get("etymology_core")
          prefix = entry.get("etymology_prefix")
          suffix = entry.get("etymology_suffix")
          if core and len(core) >= 4 and not prefix and suffix:
            entry["core"] = core
            core_index[core].append(word)
      for key, entries in word_dict.items():
        for entry in entries:
          word = entry["word"]
          if not regex.fullmatch("[a-z]+", word): continue
          bases = base_index.get(word)
          if bases:
            entry["base"] = list(bases)
          children = core_index.get(word)
          if children:
            entry["core_child"] = list(children)
    logger.info("Indexing base forms done: elapsed_time={:.2f}s".format(
      time.time() - start_time))
    start_time = time.time()
    logger.info("Merging entries: num_keys={}".format(len(keys)))
    merged_entries = []
    for key in keys:
      merged_entry = self.MergeRecord(
        key, word_dicts, aux_trans, aoa_words, keywords,
        phrase_prob_dbm, tran_prob_dbm, rev_prob_dbm, cooc_prob_dbm)
      if not merged_entry: continue
      merged_entries.append((key, merged_entry))
      if len(merged_entries) % 1000 == 0:
        logger.info("Merging entries: num_entries={}".format(len(merged_entries)))
    logger.info("Making records done: num_records={}, elapsed_time={:.2f}s".format(
      len(merged_entries), time.time() - start_time))
    start_time = time.time()
    logger.info("Modifying entries")
    merged_entries = sorted(merged_entries)
    live_words = tkrzw.DBM()
    live_words.Open("", True, dbm="BabyDBM").OrDie()
    rev_live_words = tkrzw.DBM()
    rev_live_words.Open("", True, dbm="BabyDBM").OrDie()
    for key, merged_entry in merged_entries:
      for word_entry in merged_entry:
        word = word_entry["word"]
        prob = float(word_entry.get("probability") or 0)
        value = "{:.8f}".format(prob)
        live_words.Set(word, value).OrDie()
        rev_word = " ".join(reversed(word.split(" ")))
        rev_live_words.Set(rev_word, value).OrDie()
    num_entries = 0
    for key, merged_entry in merged_entries:
      for word_entry in merged_entry:
        word = word_entry["word"]
        entries = []
        for label, word_dict in word_dicts:
          dict_entries = word_dict.get(key)
          if not dict_entries: continue
          for entry in dict_entries:
            if entry["word"] == word:
              entries.append((label, entry))
        self.SetAOA(word_entry, entries, aoa_words, live_words, phrase_prob_dbm)
        self.SetTranslations(word_entry, aux_trans, tran_prob_dbm, rev_prob_dbm)
        self.SetRelations(word_entry, entries, word_dicts, live_words, rev_live_words,
                          phrase_prob_dbm, tran_prob_dbm, cooc_prob_dbm, extra_word_bases,
                          verb_words, adj_words, adv_words)
        if phrase_prob_dbm and cooc_prob_dbm:
          self.SetCoocurrences(word_entry, entries, word_dicts, phrase_prob_dbm, cooc_prob_dbm)
        num_entries += 1
        if num_entries % 1000 == 0:
          logger.info("Modifying entries: num_records={}".format(num_entries))
    logger.info("Modifying entries done: elapsed_time={:.2f}s".format(time.time() - start_time))
    start_time = time.time()
    logger.info("Finishing entries")
    merged_dict = {}
    for key, merged_entry in merged_entries:
      merged_dict[key] = merged_entry
    num_entries = 0
    for key, merged_entry in merged_entries:
      for word_entry in merged_entry:
        self.CompensateInflections(word_entry, merged_dict, verb_words)
        self.CompensateAlternatives(word_entry, merged_dict)
        self.PropagateTranslations(word_entry, merged_dict, tran_prob_dbm, aux_last_trans)
        num_entries += 1
        if num_entries % 1000 == 0:
          logger.info("Finishing entries R1: num_records={}".format(num_entries))
    num_entries = 0
    for key, merged_entry in merged_entries:
      for word_entry in merged_entry:
        self.SetPhraseTranslations(word_entry, merged_dict, aux_trans, aux_last_trans,
                                   tran_prob_dbm, phrase_prob_dbm, noun_words, verb_words,
                                   live_words, rev_live_words)
        self.FilterParents(word_entry, merged_dict)
        self.AbsorbInflections(word_entry, merged_dict)
        num_entries += 1
        if num_entries % 1000 == 0:
          logger.info("Finishing entries R2: num_records={}".format(num_entries))
    logger.info("Finishing entries done: elapsed_time={:.2f}s".format(time.time() - start_time))
    rev_live_words.Close().OrDie()
    live_words.Close().OrDie()
    if cooc_prob_dbm:
      cooc_prob_dbm.Close().OrDie()
    if rev_prob_dbm:
      rev_prob_dbm.Close().OrDie()
    if tran_prob_dbm:
      tran_prob_dbm.Close().OrDie()
    if phrase_prob_dbm:
      phrase_prob_dbm.Close().OrDie()
    start_time = time.time()
    logger.info("Saving records: output_path={}".format(self.output_path))
    word_dbm = tkrzw.DBM()
    num_buckets = len(merged_entries) * 2
    word_dbm.Open(self.output_path, True, dbm="HashDBM", truncate=True,
                  align_pow=0, num_buckets=num_buckets)
    num_records = 0
    for key, merged_entry in merged_entries:
      final_entry = []
      for word_entry in merged_entry:
        if word_entry.get("deleted"):
          continue
        for attr_name in list(word_entry.keys()):
          if attr_name.startswith("_"):
            del word_entry[attr_name]
        final_entry.append(word_entry)
      if not final_entry: continue
      serialized = json.dumps(final_entry, separators=(",", ":"), ensure_ascii=False)
      word_dbm.Set(key, serialized)
      num_records += 1
      if num_records % 1000 == 0:
        logger.info("Saving records: num_records={}".format(num_records))
    word_dbm.Close().OrDie()
    logger.info("Saving records done: num_records={}, elapsed_time={:.2f}s".format(
      len(merged_entries), time.time() - start_time))
  def GetDerivativeStems(self, label, entry, word_dict, aux_trans, phrase_prob_dbm):
    word = entry["word"]
    prob = 1.0
    if phrase_prob_dbm:
      prob = self.GetPhraseProb(phrase_prob_dbm, "en", word)
    texts = entry.get("text") or []
    def NormalizeTran(tran):
      tran = tran.strip()
      han_tran = regex.sub(r"[^\p{Han}]", "", tran)
      if len(han_tran) >= 2:
        tran = han_tran
      elif regex.fullmatch(r"\p{Han}\p{Hiragana}+", tran):
        poses = self.tokenizer.GetJaPosList(tran)
        while len(poses) >= 2:
          pos = poses[-1]
          if not regex.fullmatch(r"\p{Hiragana}+", pos[0]): break
          if pos[1] not in ["助詞", "助動詞"] and pos[2] not in ["接尾", "非自立"]: break
          poses = poses[:-1]
        norm_tran = ""
        for pos in poses:
          norm_tran += pos[3]
        if len(norm_tran) >= 2:
          tran = norm_tran
      tran = regex.sub(r"^([\p{Han}]{2,})的$", r"\1", tran)
      return tran
    def GetMetadata(in_entry, out_poses, out_deris, out_trans):
      in_word = in_entry["word"]
      for pos, text in in_entry["text"]:
        out_poses.add(pos)
        for part in text.split("[-]"):
          part = part.strip()
          match = regex.search(r"^\[(synonym|derivative)\]: (.*)", part)
          if match:
            expr = regex.sub(r"\[-.*", "", match.group(2))
            for deri in expr.split(","):
              deri = deri.strip()
              if regex.fullmatch("[a-z]+", deri):
                out_deris.add(deri)
          match = regex.search(r"^\[translation\]: (.*)", part)
          if match:
            expr = regex.sub(r"\[-.*", "", match.group(1))
            expr = regex.sub(r"\(.*?\)", "", expr).strip()
            for tran in expr.split(","):
              tran = NormalizeTran(tran)
              if len(tran) >= 2:
                out_trans.add(tran)
        if label in self.gross_labels:
          text = regex.sub(r"\(.*?\)", "", text)
          text = regex.sub(r"（.*?）", "", text)
          for tran in regex.split(r"[,、。]", text):
            tran = NormalizeTran(tran)
            if len(tran) >= 2:
              out_trans.add(tran)
      relation_expr = in_entry.get("relation")
      if relation_expr:
        for rel_word in relation_expr.split(","):
          check_len = max(5, len(rel_word) - 1)
          if rel_word[:check_len] == word[:check_len]:
            out_deris.add(rel_word)
      in_aux_trans = aux_trans.get(in_word)
      if in_aux_trans:
        for tran in in_aux_trans:
          tran = NormalizeTran(tran)
          if len(tran) >= 2:
            out_trans.add(tran)
    poses = set()
    deris = set()
    trans = set()
    GetMetadata(entry, poses, deris, trans)
    deri_probs = {}
    deri_trans = {}
    if phrase_prob_dbm:
      for deri in deris:
        if deri[:3] != word[:3]: continue
        deri_probs[deri] = self.GetPhraseProb(phrase_prob_dbm, "en", deri)
        for deri_entry in word_dict.get(deri) or []:
          if deri_entry["word"] == deri:
            one_deri_poses = set()
            one_deri_deris = set()
            one_deri_trans = set()
            GetMetadata(deri_entry, one_deri_poses, one_deri_deris, one_deri_trans)
            if one_deri_trans:
              deri_trans[deri] = one_deri_trans
    stems = set()
    for pos in poses:
      for rule_pos, suffixes in (
          ("noun", noun_suffixes),
          ("verb", verb_suffixes),
          ("adjective", adjective_suffixes),
          ("adverb", adverb_suffixes)):
        if pos == rule_pos:
          for suffix in suffixes:
            if word.endswith(suffix):
              stem = word[:-len(suffix)]
              if len(stem) >= 2:
                stems.add(stem)
                if len(suffix) >= 2 and stem[-1] == suffix[0]:
                  stems.add(stem + suffix[0])
                if len(suffix) >= 2 and stem[-1] == "i":
                  stems.add(stem[:-1] + "y")
                if len(suffix) >= 2 and suffix[0] == "i":
                  stems.add(stem + "e")
                if len(suffix) >= 2 and suffix[0] == "e":
                  stems.add(stem + "e")
                if len(suffix) >= 2 and suffix[0] == "t":
                  stems.add(stem + "t")
                if len(suffix) >= 3 and suffix[0] == "s":
                  stems.add(stem + "s")
                if suffix == "al" and len(stem) >= 3:
                  stems.add(stem + "es")
                  stems.add(stem + "e")
                if suffix == "y" and len(stem) >= 3:
                  stems.add(stem + "e")
                if suffix in ["tion", "sion"] and len(stem) >= 2:
                  stems.add(stem + "e")
                  stems.add(stem + "d")
                  stems.add(stem + "t")
                  stems.add(stem + "s")
                  stems.add(stem + "te")
                  stems.add(stem + "de")
                  stems.add(stem + "se")
                  stems.add(stem + "ve")
                if suffix in ["tion", "sion"] and len(stem) >= 3 and stem.endswith("a"):
                  stems.add(stem[:-1])
                  stems.add(stem[:-1] + "e")
                  stems.add(stem[:-1] + "ate")
                if suffix in ["tion", "sion"] and len(stem) >= 3 and stem.endswith("u"):
                  stems.add(stem[:-1] + "ve")
                if suffix == "sion" and len(stem) >= 3 and stem.endswith("s"):
                  stems.add(stem[:-1] + "t")
                if suffix in ["ible", "able"] and len(stem) >= 2:
                  stems.add(stem + "or")
                  stems.add(stem + "er")
                  stems.add(stem + "ify")
                  stems.add(stem + "y")
                if suffix == "ate":
                  stems.add(stem + "e")
                if suffix == "al" and len(stem) >= 3 and stem.endswith("r"):
                  stems.add(stem[:-1] + "er")
                if suffix == "ive" and len(stem) >= 3 and stem.endswith("s"):
                  stems.add(stem[:-1] + "d")
                  stems.add(stem[:-1] + "de")
                if suffix == "ic" and len(stem) >= 3:
                  stems.add(stem + "y")
                if suffix == "ize" and len(stem) >= 3:
                  stems.add(stem + "y")
                if suffix == "ity" and len(stem) >= 6 and stem.endswith("bil"):
                  stems.add(stem[:-3] + "ble")
                if suffix == "pt" and len(stem) >= 3:
                  stems.add(stem[:-1] + "ve")
                if suffix == "ce" and len(stem) >= 3:
                  stems.add(stem + "t")
                  stems.add(stem + "d")
                  stems.add(stem + "se")
                if suffix == "ian" and len(stem) >= 4:
                  stems.add(stem + "y")
                if suffix == "cy" and len(stem) >= 4:
                  stems.add(stem + "t")
                if suffix == "faction" and len(stem) >= 4:
                  stems.add(stem + "fy")
                if suffix == "ous" and len(stem) >= 4:
                  stems.add(stem + "on")
                  stems.add(stem + "y")
                  stems.add(stem + "e")
                if suffix == "ous" and len(stem) >= 5 and stem.endswith("ul"):
                  stems.add(stem[:-2] + "le")
                if suffix == "ant" and len(stem) >= 4:
                  stems.add(stem + "ate")
                  stems.add(stem + "e")
                if suffix == "ative" and len(stem) >= 4:
                  stems.add(stem + "e")
                if suffix in ["er", "or", "ive"] and len(stem) >= 5:
                  stems.add(stem + "e")
                if len(stem) >= 3 and stem.endswith("u"):
                  stems.add(stem + "e")
                if len(stem) >= 4 and stem.endswith("i"):
                  stems.add(stem[:-1] + "e")
                if len(stem) >= 4 and stem.endswith("rr"):
                  stems.add(stem[:-1])
                if len(stem) >= 5 and stem.endswith("t"):
                  stems.add(stem[:-1] + "ce")
                  stems.add(stem[:-1] + "d")
                if len(stem) >= 5 and stem.endswith("v"):
                  stems.add(stem + "e")
                if len(stem) >= 8 and stem.endswith("tic"):
                  stems.add(stem + "s")
                if len(stem) >= 4 and stem[-1] == stem[-2]:
                  stems.add(stem[:-1])
    stems.discard(word)
    #print("STEM", label, word, stems)

    valid_stems = set()
    for pos, text in texts:
      match = regex.search(
        r'^[" ]*([\p{Latin}]+)[" ]*の(複数形|三人称|動名詞|現在分詞|過去形|過去分詞)', text)
      if match:
        stem = match.group(1)
        if len(stem) >= 4 and word.startswith(stem):
          valid_stems.add(stem)
    for stem in stems:
      if phrase_prob_dbm:
        stem_prob = self.GetPhraseProb(phrase_prob_dbm, "en", stem)
      else:
        stem_prob = prob
      stem_prob_ratio = stem_prob / prob
      if label not in self.core_labels and stem_prob_ratio < 0.1: continue
      if (stem.find(" ") < 0 and len(stem) >= 8 and len(stem) < len(word) and
          stem_prob_ratio >= 0.5):
        valid_stems.add(stem)
        continue
      stem_entry = None
      for tmp_stem_entry in word_dict.get(stem) or []:
        if tmp_stem_entry["word"] == stem:
          stem_entry = tmp_stem_entry
      is_known = False
      if stem_prob_ratio >= 0.001:
        is_known = True
      if stem_entry and word in (stem_entry.get("related") or []):
        is_known = True
      if len(stem) >= 6 and stem_prob_ratio >= 0.0002:
        is_known = True
      stem_poses = set()
      stem_deris = set()
      stem_trans = set()
      if stem_entry:
        GetMetadata(stem_entry, stem_poses, stem_deris, stem_trans)
      if stem.find(" ") < 0 and len(stem) >= 4 and trans:
        hit_deri = False
        if word in stem_deris:
          hit_deri = True
        hit_tran = False
        for stem_tran in stem_trans:
          if stem_tran in trans:
            hit_tran = True
          if regex.search(r"\p{Han}", stem_tran):
            for tran in trans:
              if tran.find(stem_tran) >= 0 or stem_tran.find(tran) >= 0:
                hit_tran = True
        if ((hit_deri and hit_tran) or (stem_prob_ratio >= 0.1 and hit_deri) or
            (is_known and hit_tran)):
          valid_stems.add(stem)
      check_len = max(3, len(stem) - 2)
      for deri in deris:
        if len(word) < len(deri):
          continue
        deri_prob = deri_probs.get(deri) or 0.0
        deri_prob_ratio = deri_prob / prob
        hit_deri = False
        if deri == stem:
          hit_deri = True
        if stem[:check_len] == deri[:check_len] and len(stem) >= 4:
          prefix = deri[:len(stem)]
          if prefix == stem:
            hit_deri = True
          if len(prefix) >= 6 and tkrzw.Utility.EditDistanceLev(stem, prefix) < 2:
            hit_deri = True
        hit_tran = False
        for deri_tran in deri_trans.get(deri) or []:
          if deri_tran in trans:
            hit_tran = True
          if regex.search(r"\p{Han}", deri_tran):
            for tran in trans:
              if tran.find(deri_tran) >= 0:
                hit_tran = True
        if hit_deri and (deri_prob_ratio >= 0.1 or hit_tran):
          valid_stems.add(deri)
    force_parent = force_parents.get(word)
    if force_parent:
      valid_stems.clear()
      valid_stems.add(force_parent)
    valid_stems.discard(word)
    #print("VALID", word, valid_stems)

    return list(valid_stems)
  def MergeRecord(self, key, word_dicts, aux_trans, aoa_words, keywords,
                  phrase_prob_dbm, tran_prob_dbm, rev_prob_dbm, cooc_prob_dbm):
    word_entries = {}
    word_shares = collections.defaultdict(float)
    word_trans = collections.defaultdict(set)
    entry_tran_texts = collections.defaultdict(list)
    num_words = 0
    poses = collections.defaultdict(set)
    synonyms = collections.defaultdict(set)
    core = None
    for label, word_dict in word_dicts:
      dict_entries = word_dict.get(key)
      if not dict_entries: continue
      for entry in dict_entries:
        num_words += 1
        word = entry["word"]
        entries = word_entries.get(word) or []
        entries.append((label, entry))
        word_entries[word] = entries
        texts = entry.get("text")
        if texts:
          text_score = len(texts) * 1.0
          for pos, text in texts:
            poses[word].add(pos)
            trans = self.ExtractTextLabelTrans(text)
            if trans:
              text_score += 0.5
              word_trans[word].update(trans)
          word_shares[word] += math.log2(1 + text_score)
        expr = entry.get("synonym")
        if expr:
          for synonym in regex.split(r"[,;]", expr):
            synonym = synonym.strip()
            if regex.search(r"\p{Latin}", synonym) and synonym.lower() != word.lower():
              synonyms[word].add(synonym)
        if not core:
          core = entry.get("core")
      dict_entries = word_dict.get(key + "\ttranslation")
      if dict_entries:
        for entry in dict_entries:
          word = entry["word"]
          tran_texts = entry.get("text")
          if not tran_texts: continue
          for tran_pos, tran_text in tran_texts:
            tran_key = word + "\t" + label + "\t" + tran_pos
            entry_tran_texts[tran_key].append(tran_text)
            trans = self.ExtractTextLabelTrans(tran_text)
            if trans:
              word_trans[word].update(trans)
    sorted_word_shares = sorted(word_shares.items(), key=lambda x: x[1], reverse=True)
    if len(sorted_word_shares) > 1 and aux_trans and tran_prob_dbm:
      spell_ratios = {}
      if phrase_prob_dbm:
        word_probs = {}
        for word, share in sorted_word_shares:
          if word in word_probs: continue
          prob = self.GetPhraseProb(phrase_prob_dbm, "en", word)
          if not regex.search(r"\p{Lu}", word):
            prob *= 1.1
          word_probs[word] = prob
        sum_prob = sum([x[1] for x in word_probs.items()])
        for word, prob in word_probs.items():
          spell_ratios[word] = prob / sum_prob
      word_scores = []
      for word, share in sorted_word_shares:
        score = 0.0
        if word in keywords:
          score += 0.1
        cap_aux_trans = aux_trans.get(word) or []
        if cap_aux_trans:
          score += 0.1
        cap_word_trans = word_trans.get(word) or []
        cap_trans = set(cap_aux_trans).union(cap_word_trans)
        tran_score = 0.0
        if cap_trans:
          key = tkrzw_dict.NormalizeWord(word)
          tsv = tran_prob_dbm.GetStr(key)
          if tsv:
            fields = tsv.split("\t")
            max_prob = 0.0
            sum_prob = 0.0
            for i in range(0, len(fields), 3):
              src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
              if src != word:
                prob *= 0.1
              if not regex.search(r"[\p{Han}\p{Hiragana}]", trg):
                prob *= 0.5
              if regex.search(r"\p{Lu}", src):
                prob *= 0.5
              if trg in cap_trans:
                max_prob = max(max_prob, prob)
                sum_prob += prob
            tran_score += (sum_prob * max_prob) ** 0.5
        spell_score = (spell_ratios.get(word) or 0.0) * 0.5
        score += ((tran_score + 0.05) * (share + 0.05) * (spell_score + 0.05)) ** (1 / 3)
        word_scores.append((word, score))
      sorted_word_shares = sorted(word_scores, key=lambda x: x[1], reverse=True)
    share_sum = sum([x[1] for x in sorted_word_shares])
    merged_entry = []
    for word, share in sorted_word_shares:
      entries = word_entries[word]
      word_entry = {}
      word_entry["word"] = word
      stem = " ".join(self.tokenizer.Tokenize("en", word, False, True))
      effective_labels = set()
      surfaces = set([word.lower()])
      is_keyword = (word in aux_trans or word in aoa_words or word in keywords or
                    (core and core in keywords))
      word_poses = poses[word]
      for pos in word_poses:
        for rule_pos, suffixes in (
            ("noun", noun_suffixes),
            ("verb", verb_suffixes),
            ("adjective", adjective_suffixes),
            ("adverb", adverb_suffixes)):
          if pos == rule_pos:
            for suffix in suffixes:
              if word.endswith(suffix):
                pos_stem = word[:-len(suffix)]
                if len(pos_stem) >= 4:
                  pos_stems = set()
                  pos_stems.add(pos_stem)
                  pos_stems.add(regex.sub(r"i$", r"y", pos_stem))
                  for pos_stem in pos_stems:
                    if pos_stem in aux_trans or pos_stem in aoa_words or pos_stem in keywords:
                      is_keyword = True
                      break
      if not is_keyword and "verb" in word_poses and regex.fullmatch(r"[a-z ]+", word):
        tokens = self.tokenizer.Tokenize("en", word, False, False)
        if len(tokens) >= 2 and tokens[0] in keywords:
          particle_suffix = True
          for token in tokens[1:]:
            if not token in particles:
              particle_suffix = False
              break
          if particle_suffix:
            is_keyword = True
      is_super_keyword = is_keyword and bool(regex.fullmatch(r"\p{Latin}{3,}", word))
      for label, entry in entries:
        if label not in self.surfeit_labels or is_keyword:
          effective_labels.add(label)
        for top_name in top_names:
          if label not in self.top_labels and top_name in word_entry: continue
          value = entry.get(top_name)
          if value:
            value = unicodedata.normalize('NFKC', value)
            word_entry[top_name] = value
        for infl_name in inflection_names:
          value = entry.get(infl_name)
          if value:
            surfaces.add(value.lower())
      if merged_entry and not effective_labels:
        continue
      for label, entry in entries:
        texts = entry.get("text")
        if not texts: continue
        has_good_text = False
        for pos, text in texts:
          pure_text = regex.sub(r"[^\p{Latin}\p{Han}\p{Hiragana}\p{Katakana}\d]", "", text)
          if not pure_text or pure_text == stem:
            continue
          has_good_text = True
        if not has_good_text:
          continue
        for pos, text in texts:
          items = word_entry.get("item") or []
          tran_key = word + "\t" + label + "\t" + pos
          sections = []
          for section in text.split(" [-] "):
            if not sections:
              sections.append(section)
              continue
            eg_match = regex.search(r"^e\.g\.: (.*)", section)
            if eg_match:
              eg_text = eg_match.group(1).lower()
              eg_words = regex.findall(r"[-\p{Latin}]+", eg_text)
              hit = False
              for surface in surfaces:
                if surface in eg_words:
                  hit = True
                  break
              if not hit: continue
            sections.append(section)
          text = " [-] ".join(sections)
          tran_texts = entry_tran_texts.get(tran_key)
          if tran_texts:
            del entry_tran_texts[tran_key]
            for tran_text in tran_texts:
              tran_item = {"label": label, "pos": pos, "text": tran_text}
              items.append(tran_item)
          item = {"label": label, "pos": pos, "text": text}
          items.append(item)
          word_entry["item"] = items
      if "item" not in word_entry:
        continue
      num_eff_items = 0
      for item in word_entry["item"]:
        text = item["text"]
        if regex.search(r" (of|for) +\"", text) and len(text) < 50:
          continue
        if (regex.search(r"\p{Latin}.*の.*(単数|複数|現在|過去|比較|最上).*(形|級|分詞)", text) and
            len(text) < 30):
          continue
        num_eff_items += 1
      if num_eff_items == 0:
        continue
      prob = None
      if phrase_prob_dbm:
        prob = self.GetPhraseProb(phrase_prob_dbm, "en", word)
        if stem.lower() != word.lower():
          if word.endswith("ics"):
            prob *= 1.1
          elif word.count(" "):
            prob *= 0.5
          else:
            prob *= 0.1
        word_entry["probability"] = "{:.7f}".format(prob).replace("0.", ".")
        if self.min_prob_map:
          has_good_label = False
          for item in word_entry["item"]:
            if item["label"] not in self.min_prob_map:
              has_good_label = True
              break
          if not has_good_label:
            new_items = []
            for item in word_entry["item"]:
              is_good_item = True
              for label, min_prob in self.min_prob_map.items():
                if item["label"] == label:
                  if is_keyword:
                    min_prob *= 0.1
                  if is_super_keyword:
                    norm_text = tkrzw_dict.NormalizeWord(item["text"])
                    norm_text = regex.sub(r"^(to|a|an|the) +([\p{Latin}])", r"\2", norm_text)
                    dist = tkrzw.Utility.EditDistanceLev(key, norm_text)
                    dist /= max(len(key), len(norm_text))
                    if dist > 0.5 or word in aux_trans or (core and core in aux_trans):
                      min_prob = 0.0
                  if prob < min_prob:
                    is_good_item = False
              if is_good_item:
                new_items.append(item)
            word_entry["item"] = new_items
      if not word_entry.get("item"):
        continue
      share_ratio = share / share_sum
      if share_ratio < 1:
        word_entry["share"] = "{:.3f}".format(share_ratio).replace("0.", ".")
      uniq_alternatives = set()
      scored_alternatives = []
      for label, entry in entries:
        alternatives = entry.get("alternative")
        if alternatives:
          for alternative in alternatives:
            norm_alt = tkrzw_dict.NormalizeWord(alternative)
            if norm_alt == key: continue
            if label not in self.core_labels:
              dist = tkrzw.Utility.EditDistanceLev(key, norm_alt)
              dist_ratio = dist / max(len(key), len(norm_alt))
              if dist > 4 or dist_ratio > 0.3: continue
            if alternative not in uniq_alternatives:
              alt_prob = self.GetPhraseProb(phrase_prob_dbm, "en", alternative)
              scored_alternatives.append((alternative, alt_prob))
              uniq_alternatives.add(alternative)
      if scored_alternatives:
        scored_alternatives = sorted(scored_alternatives, key=lambda x: x[1], reverse=True)
        word_entry["alternative"] = [x[0] for x in scored_alternatives]
      word_synonyms = synonyms[word]
      if word_synonyms:
        word_entry["_synonym"] = list(word_synonyms)
      merged_entry.append(word_entry)
    return merged_entry
  def GetPhraseProb(self, prob_dbm, language, word):
    base_prob = 0.000000001
    tokens = self.tokenizer.Tokenize(language, word, False, True)
    if not tokens: return base_prob
    max_ngram = min(3, len(tokens))
    fallback_penalty = 1.0
    for ngram in range(max_ngram, 0, -1):
      if len(tokens) <= ngram:
        cur_phrase = " ".join(tokens)
        prob = float(prob_dbm.GetStr(cur_phrase) or 0.0)
        if prob:
          return max(prob, base_prob)
        fallback_penalty *= 0.1
      else:
        probs = []
        index = 0
        miss = False
        while index <= len(tokens) - ngram:
          cur_phrase = " ".join(tokens[index:index + ngram])
          cur_prob = float(prob_dbm.GetStr(cur_phrase) or 0.0)
          if not cur_prob:
            miss = True
            break
          probs.append(cur_prob)
          index += 1
        if not miss:
          inv_sum = 0
          for cur_prob in probs:
            inv_sum += 1 / cur_prob
          prob = len(probs) / inv_sum
          prob *= 0.3 ** (len(tokens) - ngram)
          prob *= fallback_penalty
          return max(prob, base_prob)
        fallback_penalty *= 0.1
    return base_prob
  def SetAOA(self, word_entry, entries, aoa_words, live_words, phrase_prob_dbm):
    word = word_entry["word"]
    phrase_prob = min(float(word_entry.get("probability") or 0), 0.0000001)
    share = float(word_entry.get("share") or 1)
    share_bias = 0.0
    if share < 0.5:
      share_bias = (0.5 - share) * 4
    aoa = aoa_words.get(word)
    if aoa:
      aoa += share_bias
      word_entry["aoa"] = "{:.3f}".format(aoa)
    concepts = set()
    for label, entry in entries:
      stems = entry.get("stem")
      if stems:
        for stem in stems:
          concepts.add(stem)
      core = entry.get("core")
      if core:
        concepts.add(core)
    min_aoa = sys.maxsize
    for concept in concepts:
      if not live_words.Get(concept):
        continue
      aoa = aoa_words.get(concept)
      if aoa:
        if phrase_prob and phrase_prob_dbm:
          concept_prob = self.GetPhraseProb(phrase_prob_dbm, "en", concept)
          diff = max(math.log(concept_prob) - math.log(phrase_prob), 0.0)
          aoa += min(diff * 1.0, 1.0)
        else:
          aoa += 1.0
        min_aoa = min(min_aoa, aoa)
    if min_aoa < sys.maxsize:
      min_aoa += share_bias
      word_entry["aoa_concept"] = "{:.3f}".format(min_aoa)
    bases = set()
    for label, entry in entries:
      tmp_bases = entry.get("base")
      if tmp_bases:
        for base in tmp_bases:
          bases.add(base)
    stem = " ".join(self.tokenizer.Tokenize("en", word, False, True))
    if stem != word:
      bases.add(stem)
    min_aoa = sys.maxsize
    for base in bases:
      if not live_words.Get(base):
        continue
      aoa = aoa_words.get(base)
      if aoa:
        aoa += 1.0
        min_aoa = min(min_aoa, aoa)
    if min_aoa < sys.maxsize:
      min_aoa += share_bias
      word_entry["aoa_base"] = "{:.3f}".format(min_aoa)
  def ExtractTextLabelTrans(self, text):
    trans = []
    match = regex.search(r"\[translation\]: ", text)
    if match:
      text = text[match.end():]
      text = regex.sub(r"\[-.*", "", text)
      text = regex.sub(r"\(.*?\)", "", text)
      for tran in text.split(","):
        tran = unicodedata.normalize('NFKC', tran)
        tran = tran.strip()
        tran = regex.sub(r"[\p{S}\p{P}]+ *(を|の|が|に|へ|と|から|より|で|や)", "", tran)
        tran = regex.sub(r"[～\p{S}\p{P}]", " ", tran)
        tran = regex.sub(r" +(?=[\p{Han}\p{Hiragana}\p{Katakana}ー])", "", tran)
        tran = regex.sub(r"[\p{Z}]+", " ", tran).strip()
        if tran:
          trans.append(tran)
    return trans
  def SetTranslations(self, entry, aux_trans, tran_prob_dbm, rev_prob_dbm):
    word = entry["word"]
    tran_probs = {}
    if tran_prob_dbm:
      key = tkrzw_dict.NormalizeWord(word)
      tsv = tran_prob_dbm.GetStr(key)
      if tsv:
        fields = tsv.split("\t")
        extra_records = []
        for i in range(0, len(fields), 3):
          src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
          if regex.search("[っん]$", trg) and self.tokenizer.GetJaLastPos(trg)[1] == "動詞":
            continue
          if src != word:
            prob *= 0.1
          norm_trg = tkrzw_dict.NormalizeWord(trg)
          if tkrzw_dict.IsStopWord("ja", norm_trg):
            prob *= 0.7
          elif len(norm_trg) < 2:
            prob *= 0.9
          prob **= 0.8
          tran_probs[norm_trg] = max(tran_probs.get(norm_trg) or 0.0, prob)
          stem_trg = regex.sub(
            r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|させる)$",
            r"\1", norm_trg)
          if stem_trg != norm_trg:
            extra_records.append((stem_trg, prob * 0.5))
          stem_trg = self.tokenizer.CutJaWordNounParticle(norm_trg)
          if stem_trg != norm_trg:
            extra_records.append((stem_trg, prob * 0.5))
* 0.5))1370 stem_trg = regex.sub(r"([\p{Han}\p{Katakana}ã¼]{2,})(ç|çãª|çã«)$", r"\1", norm_trg)1371 if stem_trg != norm_trg:1372 extra_records.append((stem_trg, prob * 0.5))1373 if self.tokenizer.IsJaWordSahenNoun(norm_trg):1374 long_trg = norm_trg + "ãã"1375 extra_records.append((long_trg, prob * 0.5))1376 for extra_trg, extra_prob in extra_records:1377 tran_probs[extra_trg] = max(tran_probs.get(extra_trg) or 0.0, extra_prob)1378 word_aux_trans = aux_trans.get(word)1379 count_aux_trans = {}1380 if word_aux_trans:1381 for aux_tran in word_aux_trans:1382 count = (count_aux_trans.get(aux_tran) or 0) + 11383 count_aux_trans[aux_tran] = count1384 aux_weight = 1.01385 extra_records = []1386 for aux_tran, count in count_aux_trans.items():1387 aux_score = (0.01 ** (1 / (count + 1))) * aux_weight1388 prob = (tran_probs.get(aux_tran) or 0) + aux_score1389 tran_probs[aux_tran] = prob1390 stem_tran = regex.sub(1391 r"([\p{Han}\p{Katakana}ã¼]{2,})(ãã|ãããã¨|ããã|ããããã¨|ããã)$",1392 r"\1", aux_tran)1393 if stem_tran != aux_tran:1394 extra_records.append((stem_tran, aux_score * 0.5))1395 stem_tran = self.tokenizer.CutJaWordNounParticle(aux_tran)1396 if stem_tran != aux_tran:1397 extra_records.append((stem_tran, aux_score * 0.5))1398 stem_tran = regex.sub(r"([\p{Han}\p{Katakana}ã¼]{2,})(ç|çãª|çã«)$", r"\1", aux_tran)1399 if stem_tran != aux_tran:1400 extra_records.append((stem_tran, aux_score * 0.5))1401 if self.tokenizer.IsJaWordSahenNoun(aux_tran):1402 long_tran = aux_tran + "ãã"1403 extra_records.append((long_tran, aux_score * 0.5))1404 aux_weight *= 0.91405 for extra_tran, extra_prob in extra_records:1406 tran_probs[extra_tran] = max(tran_probs.get(extra_tran) or 0.0, extra_prob)1407 translations = {}1408 tran_labels = {}1409 def Vote(tran, weight, label):1410 if regex.search(r"^(noun|verb|adj|adv|[0-9])[^\p{Latin}]", tran):1411 return1412 norm_tran = tkrzw_dict.NormalizeWord(tran)1413 score = 0.000011414 if rev_prob_dbm:1415 prob = self.GetPhraseProb(rev_prob_dbm, "ja", tran)1416 prob = max(prob, 0.0000001)1417 prob = math.exp(-abs(math.log(0.001) - math.log(prob))) * 0.11418 if tkrzw_dict.IsStopWord("ja", tran) or tran in ("åã¯"):1419 prob *= 0.51420 score += prob1421 score *= weight1422 old_score = translations.get(tran) or 0.01423 translations[tran] = max(old_score, score)1424 if label:1425 old_labels = tran_labels.get(norm_tran) or set()1426 old_labels.add(label)1427 tran_labels[norm_tran] = old_labels1428 body_weight = 1.01429 tran_weight = 0.71430 for item in entry["item"]:1431 label = item["label"]1432 pos = item["pos"]1433 sections = item["text"].split(" [-] ")1434 text = sections[0]1435 text = regex.sub(r"ã *(ã¾ã|ã¾ãã¯|åã¯)ã.*?ã", r"ã", text)1436 if (label in self.gross_labels and1437 regex.search(r"[\p{Han}\p{Hiragana}\p{Katakana}ã¼]", text)):1438 weight = body_weight1439 body_weight *= 0.91440 if regex.search(r"[\(ï¼ãã\{\(]([^)ï¼ãã\}\]]+[ã»ã])?" +1441 r"(ä¿|ä¿èª|ã¹ã©ã³ã°|å|åèª|é èª|å¤|å¤èª|å»|å»ç¨|å»èª)+[)ï¼ãã\}\]]",1442 text):1443 weight *= 0.11444 text = regex.sub(r"[\(ï¼ãã\{\(].*?[)ï¼ãã\}\]]", "ã", text)1445 text = regex.sub(r"[ï½¥ã»]", "", text)1446 text = regex.sub(r"\p{Z}+", " ", text).strip()1447 if regex.search(1448 r"ã®(ç´æ¥æ³|ç´èª¬æ³|ä»®å®æ³)?(ç¾å¨|éå»)?(第?[ä¸äºä¸]人称)?[ ã»ï½¥ã]?" +1449 r"(åæ°|è¤æ°|ç¾å¨|éå»|æ¯è¼|æä¸|é²è¡|å®äº|ååè©|åç´|縮ç´)+[ ã»ï½¥ã]?" 
+1450 r"(å½¢|å|åè©|ç´|ååè©|åè©|åè©|形容è©|å¯è©)+", text):1451 continue1452 if regex.search(r"ã®(ç´æ¥æ³|ç´èª¬æ³|ä»®å®æ³)(ç¾å¨|éå»)", text):1453 continue1454 if regex.search(r"ã®(ååè©|ç°ç¶´|ç°ä½|ç°å½¢|å¤èª|ç¥|çç¥|ç縮|é åèª)", text):1455 continue1456 if regex.search(r"ãã®ä»ã[^ãã]{12,}", text):1457 continue1458 text = regex.sub(r" \[-+\] .*", "", text).strip()1459 text = regex.sub(r" -+ .*", "", text).strip()1460 for tran in regex.split("[ã|ã|ï¼|,|;]", text):1461 if len(translations) > 1:1462 if tran in ("ã¾ã", "ã¾ãã¯", "åã¯", "ãã°ãã°"):1463 continue1464 if regex.search(r"^[ \p{Latin}]+ã", tran):1465 continue1466 tran = regex.sub(r"^[\p{S}\p{P}]+ *(ã|ã®|ã|ã«|ã¸|ã¨|ãã|ãã|ã§|ã)", "", tran)1467 tran = regex.sub(r"[ï½ã]", "", tran)1468 tokens = self.tokenizer.Tokenize("ja", tran, False, False)1469 if len(tokens) > 6:1470 break1471 if regex.search(r"^[ \p{Latin}]+ *ãªã©", tran):1472 continue1473 if regex.search(r"[\p{Latin}].*ã®.*(è©å½¢|綴ã)$", tran):1474 continue1475 tran = " ".join(tokens)1476 tran = regex.sub(r"([\p{Han}\p{Hiragana}\p{Katakana}ã¼]) +", r"\1", tran)1477 tran = regex.sub(r" +([\p{Han}\p{Hiragana}\p{Katakana}ã¼])", r"\1", tran)1478 tran = regex.sub(r"[\p{Z}]+", " ", tran).strip()1479 if tran:1480 Vote(tran, weight, label)1481 weight *= 0.81482 if label in self.tran_list_labels:1483 for section in sections:1484 trans = self.ExtractTextLabelTrans(section)1485 if not trans: continue1486 weight = tran_weight1487 tran_weight *= 0.91488 uniq_trans = set()1489 for tran in trans:1490 norm_tran = self.tokenizer.NormalizeJaWordForPos(pos, tran)1491 if norm_tran and norm_tran not in uniq_trans:1492 Vote(norm_tran, weight, label)1493 weight *= 0.81494 uniq_trans.add(norm_tran)1495 if label in self.supplement_labels:1496 text = sections[0]1497 uniq_trans = set()1498 for tran in regex.split("[;,]", text):1499 norm_tran = self.tokenizer.NormalizeJaWordForPos(pos, tran.strip())1500 if norm_tran and norm_tran not in uniq_trans:1501 Vote(norm_tran, 0.01, "")1502 pos_scores = {}1503 pos_base_score = 1.01504 for item in entry["item"]:1505 pos = item["pos"]1506 score = pos_base_score1507 if item["label"] not in self.core_labels:1508 score *= 0.751509 pos_scores[pos] = (pos_scores.get(pos) or 0.0) + score1510 pos_base_score *= 0.91511 pos_sum_score = 0.0011512 for pos, score in pos_scores.items():1513 pos_sum_score += score1514 pure_noun = (pos_scores.get("noun") or 0.0) / pos_sum_score >= 0.91515 pure_verb = (pos_scores.get("verb") or 0.0) / pos_sum_score >= 0.91516 pure_adjective = (pos_scores.get("adjective") or 0.0) / pos_sum_score >= 0.91517 pure_adverb = (pos_scores.get("adverb") or 0.0) / pos_sum_score >= 0.91518 bonus_translations = []1519 scored_translations = set()1520 for tran, score in translations.items():1521 tran = unicodedata.normalize('NFKC', tran)1522 norm_tran = tkrzw_dict.NormalizeWord(tran)1523 prob = tran_probs.get(norm_tran)1524 if prob:1525 if len(norm_tran) < 2:1526 prob *= 0.51527 score += prob1528 del tran_probs[norm_tran]1529 scored_translations.add(norm_tran)1530 bonus_translations.append((tran, score))1531 sorted_translations = []1532 for tran, score in bonus_translations:1533 norm_tran = tkrzw_dict.NormalizeWord(tran)1534 if norm_tran not in scored_translations:1535 bonus = 0.01536 for dict_tran, prob in tran_probs.items():1537 if len(dict_tran) >= 2 and norm_tran.startswith(dict_tran):1538 bonus = max(bonus, prob * 0.3)1539 elif len(norm_tran) >= 2 and dict_tran.startswith(norm_tran):1540 bonus = max(bonus, prob * 0.2)1541 elif len(dict_tran) >= 2 and 
norm_tran.find(dict_tran) >= 0:1542 bonus = max(bonus, prob * 0.1)1543 elif len(norm_tran) >= 2 and dict_tran.find(norm_tran) >= 0:1544 bonus = max(bonus, prob * 0.1)1545 score += bonus1546 if norm_tran in tran_labels:1547 score += (len(tran_labels[norm_tran]) - 1) * 0.0011548 tran_pos = self.tokenizer.GetJaLastPos(tran)1549 if pure_noun:1550 if tran_pos[1] == "åè©" and regex.search(r"\p{Han}", tran):1551 score *= 1.21552 if pure_verb:1553 if tran_pos[1] == "åè©":1554 if regex.search("[ãããã¤ã¬ãµããã]$", tran):1555 score *= 1.31556 elif self.tokenizer.IsJaWordSahenNoun(tran):1557 score *= 1.21558 if pure_adjective:1559 tran_pos = self.tokenizer.GetJaLastPos(tran)1560 if tran_pos[1] == "形容è©" or self.tokenizer.IsJaWordAdjvNoun(tran):1561 score *= 1.21562 if (pure_verb or pure_adjective or pure_adverb):1563 if len(tran) <= 1:1564 score *= 0.81565 if regex.search(r"[\p{Katakana}]", tran):1566 score *= 0.71567 if regex.fullmatch(r"[\p{Katakana}ã¼]+", tran):1568 score *= 0.71569 elif regex.fullmatch(r"[\p{Hiragana}ã¼]+", tran):1570 score *= 0.91571 elif not regex.search(r"[\p{Han}\p{Hiragana}\p{Katakana}]+", tran):1572 score *= 0.71573 else:1574 if regex.search(r"[\p{Katakana}]", tran):1575 score *= 0.81576 if regex.fullmatch(r"[\p{Katakana}ã¼]+", tran):1577 score *= 0.81578 elif regex.fullmatch(r"[\p{Hiragana}ã¼]+", tran):1579 score *= 0.951580 elif not regex.search(r"[\p{Han}\p{Hiragana}\p{Katakana}]+", tran):1581 score *= 0.81582 sorted_translations.append((tran, score))1583 sorted_translations = sorted(sorted_translations, key=lambda x: x[1], reverse=True)1584 deduped_translations = []1585 for tran, score in sorted_translations:1586 norm_tran = tkrzw_dict.NormalizeWord(tran)1587 bias = 1.01588 for prev_tran, prev_score in deduped_translations:1589 if len(prev_tran) >= 2 and norm_tran.startswith(prev_tran):1590 bias = min(bias, 0.4 if len(prev_tran) >= 2 else 0.6)1591 elif len(norm_tran) >= 2 and prev_tran.startswith(norm_tran):1592 bias = min(bias, 0.6 if len(norm_tran) >= 2 else 0.7)1593 elif len(prev_tran) >= 2 and norm_tran.find(prev_tran) >= 0:1594 bias = min(bias, 0.8 if len(prev_tran) >= 2 else 0.9)1595 elif len(norm_tran) >= 2 and prev_tran.find(norm_tran) >= 0:1596 bias = min(bias, 0.8 if len(norm_tran) >= 2 else 0.9)1597 dist = tkrzw.Utility.EditDistanceLev(norm_tran, prev_tran)1598 dist /= max(len(norm_tran), len(prev_tran))1599 if dist < 0.3:1600 bias = min(bias, dist + 0.2)1601 score *= bias1602 deduped_translations.append((tran, score))1603 deduped_translations = sorted(deduped_translations, key=lambda x: x[1], reverse=True)1604 uniq_trans = set()1605 final_translations = []1606 max_elems = int(min(max(math.log2(len(entry["item"])), 2), 8) * 8)1607 for tran, score in deduped_translations:1608 tran = regex.sub(r"^ã.*", "", tran)1609 tran = regex.sub(r"ã»", "", tran)1610 norm_tran = tkrzw_dict.NormalizeWord(tran)1611 if not norm_tran or norm_tran in uniq_trans:1612 continue1613 uniq_trans.add(norm_tran)1614 match = regex.search("(.*)(ããã|ããã|ãã)$", norm_tran)1615 if match:1616 uniq_trans.add(match.group(1) + "ãã")1617 uniq_trans.add(match.group(1) + "ããã")1618 uniq_trans.add(match.group(1) + "ããã")1619 if len(final_translations) < max_elems or score >= 0.001:1620 final_translations.append(tran)1621 sorted_aux_trans = sorted(count_aux_trans.items(), key=lambda x: -x[1])1622 for aux_tran, count in sorted_aux_trans:1623 aux_tran = regex.sub(r"^ã.*", "", aux_tran)1624 aux_tran = regex.sub(r"ã»", "", aux_tran)1625 if pure_noun:1626 aux_tran = self.MakeTranNoun(aux_tran)1627 if 
pure_verb:1628 aux_tran = self.MakeTranVerb(aux_tran)1629 if pure_adjective:1630 aux_tran = self.MakeTranAdjective(aux_tran)1631 if pure_adverb:1632 aux_tran = self.MakeTranAdverb(aux_tran)1633 if len(final_translations) >= max_elems: break1634 norm_tran = tkrzw_dict.NormalizeWord(aux_tran)1635 if not norm_tran or norm_tran in uniq_trans:1636 continue1637 uniq_trans.add(norm_tran)1638 final_translations.append(aux_tran)1639 if final_translations:1640 entry["translation"] = final_translations1641 def SetRelations(self, word_entry, entries, word_dicts, live_words, rev_live_words,1642 phrase_prob_dbm, tran_prob_dbm, cooc_prob_dbm, extra_word_bases,1643 verb_words, adj_words, adv_words):1644 word = word_entry["word"]1645 norm_word = tkrzw_dict.NormalizeWord(word)1646 scores = {}1647 def Vote(rel_word, label, weight):1648 values = scores.get(rel_word) or []1649 values.append((weight, label))1650 scores[rel_word] = values1651 synonyms = word_entry.get("_synonym")1652 if synonyms:1653 for synonym in synonyms:1654 Vote(synonym, "meta", 0.1)1655 parents = set()1656 children = set()1657 for label, entry in entries:1658 stems = entry.get("stem")1659 if stems:1660 for stem in stems:1661 parents.add(stem)1662 stem_children = entry.get("stem_child")1663 if stem_children:1664 for child in stem_children:1665 children.add(child)1666 core = entry.get("core")1667 if core:1668 parents.add(core)1669 core_children = entry.get("core_child")1670 if core_children:1671 for child in core_children:1672 children.add(child)1673 bases = entry.get("base")1674 if bases:1675 for base in bases:1676 parents.add(base)1677 base_children = entry.get("base_child")1678 if base_children:1679 for child in base_children:1680 children.add(child)1681 for rel_name, rel_weight in rel_weights.items():1682 ent_rel_words = []1683 expr = entry.get(rel_name)1684 if expr:1685 for rel_word in expr.split(","):1686 rel_word = rel_word.strip()1687 ent_rel_words.append(rel_word)1688 if ent_rel_words:1689 scored_rel_words = []1690 for i, rel_word in enumerate(ent_rel_words):1691 weight = 30 / (min(i, 30) + 30)1692 weight *= rel_weight1693 Vote(rel_word, label, weight)1694 texts = entry.get("text")1695 if texts:1696 base_weight = 1.11697 for text in texts:1698 for field in text[1].split(" [-] "):1699 if not field.startswith("[" + rel_name + "]: "): continue1700 field = regex.sub(r"^[^:]+: ", "", field)1701 field = regex.sub(r"\(.*?\) *", "", field)1702 for i, rel_word in enumerate(field.split(",")):1703 rel_word = rel_word.strip()1704 if rel_word:1705 weight = 30 / (min(i, 30) + 30)1706 weight *= rel_weight * base_weight1707 Vote(rel_word, label, weight)1708 base_weight *= 0.951709 extra_word_base = extra_word_bases.get(word)1710 if extra_word_base:1711 parents.add(extra_word_base)1712 alternatives = word_entry.get("alternative")1713 if alternatives:1714 for alternative in alternatives:1715 if word not in force_parents:1716 parents.discard(alternative)1717 if alternative not in force_parents:1718 children.discard(alternative)1719 for variant in self.GetSpellVariants(word):1720 if word not in force_parents:1721 parents.discard(variant)1722 if variant not in force_parents:1723 children.discard(variant)1724 for child in children:1725 parents.discard(child)1726 if word in no_parents:1727 parents.clear()1728 force_parent = force_parents.get(word)1729 if force_parent:1730 parents.clear()1731 parents.add(force_parent)1732 parents = set([x for x in parents if force_parents.get(x) != word])1733 children = set([x for x in children if x not in no_parents])1734 
translations = list(word_entry.get("translation") or [])1735 if tran_prob_dbm:1736 tsv = tran_prob_dbm.GetStr(norm_word)1737 if tsv:1738 fields = tsv.split("\t")1739 for i in range(0, len(fields), 3):1740 src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])1741 translations.append(trg)1742 translations = set([tkrzw_dict.NormalizeWord(x) for x in translations])1743 rel_words = []1744 for rel_word, votes in scores.items():1745 norm_rel_word = tkrzw_dict.NormalizeWord(rel_word)1746 label_weights = {}1747 for weight, label in votes:1748 old_weight = label_weights.get(label) or 0.01749 label_weights[label] = max(old_weight, weight)1750 total_weight = 01751 for label, weight in label_weights.items():1752 total_weight += weight1753 if tran_prob_dbm:1754 tsv = tran_prob_dbm.GetStr(norm_rel_word)1755 if tsv:1756 bonus = 0.01757 fields = tsv.split("\t")1758 for i in range(0, len(fields), 3):1759 src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])1760 norm_tran = tkrzw_dict.NormalizeWord(trg)1761 for dict_tran in translations:1762 if dict_tran == norm_tran:1763 bonus = max(bonus, 1.0)1764 elif len(dict_tran) >= 2 and norm_tran.startswith(dict_tran):1765 bonus = max(bonus, 0.3)1766 elif len(norm_tran) >= 2 and dict_tran.startswith(norm_tran):1767 bonus = max(bonus, 0.2)1768 elif len(dict_tran) >= 2 and norm_tran.find(dict_tran) >= 0:1769 bonus = max(bonus, 0.1)1770 elif len(norm_tran) >= 2 and dict_tran.find(norm_tran) >= 0:1771 bonus = max(bonus, 0.1)1772 dist = tkrzw.Utility.EditDistanceLev(dict_tran, norm_tran)1773 dist /= max(len(dict_tran), len(norm_tran))1774 if dist < 0.3:1775 bonus = max(bonus, 0.3)1776 total_weight += bonus1777 score = 1.01778 if phrase_prob_dbm:1779 prob = self.GetPhraseProb(phrase_prob_dbm, "en", rel_word)1780 prob = max(prob, 0.0000001)1781 score += math.exp(-abs(math.log(0.001) - math.log(prob))) * 0.11782 score *= total_weight1783 if tkrzw_dict.IsStopWord("en", norm_rel_word):1784 if tkrzw_dict.IsStopWord("en", norm_word):1785 score *= 0.31786 else:1787 score *= 0.11788 rel_words.append((rel_word, score))1789 rel_words = sorted(rel_words, key=lambda x: x[1], reverse=True)1790 uniq_words = set()1791 final_rel_words = []1792 for rel_word, score in rel_words:1793 if rel_word in parents or rel_word in children:1794 continue1795 if not live_words.Get(rel_word) or rel_word == word:1796 continue1797 norm_rel_word = tkrzw_dict.NormalizeWord(rel_word)1798 if not norm_rel_word: continue1799 if norm_rel_word in uniq_words: continue1800 uniq_words.add(norm_rel_word)1801 hit = False1802 for label, word_dict in word_dicts:1803 if label in self.surfeit_labels: continue1804 if norm_rel_word in word_dict:1805 hit = True1806 break1807 if not hit: continue1808 final_rel_words.append(rel_word)1809 if final_rel_words:1810 max_elems = int(min(max(math.log2(len(word_entry["item"])), 2), 6) * 6)1811 word_entry["related"] = final_rel_words[:max_elems]1812 scored_parents = []1813 for parent in parents:1814 if not live_words.Get(parent) or parent == word:1815 continue1816 prob = self.GetPhraseProb(phrase_prob_dbm, "en", parent)1817 scored_parents.append((parent, prob))1818 scored_parents = sorted(scored_parents, key=lambda x: x[1], reverse=True)1819 #print("SP", word, scored_parents)1820 1821 if scored_parents:1822 word_entry["parent"] = [x[0] for x in scored_parents]1823 scored_children = []1824 for child in children:1825 if not live_words.Get(child) or child == word or child in parents:1826 continue1827 prob = self.GetPhraseProb(phrase_prob_dbm, "en", child)1828 
scored_children.append((child, prob))1829 scored_children = sorted(scored_children, key=lambda x: x[1], reverse=True)1830 if scored_children:1831 word_entry["child"] = [x[0] for x in scored_children]1832 prob = float(live_words.GetStr(word) or 0.0)1833 if prob >= 0.000001 and regex.fullmatch(r"[-\p{Latin}]+", word):1834 prefix = word + " "1835 idioms = []1836 it = live_words.MakeIterator()1837 it.Jump(prefix)1838 while True:1839 rec = it.GetStr()1840 if not rec: break1841 cmp_word, cmp_prob = rec1842 if not cmp_word.startswith(prefix): break1843 cmp_prob = float(cmp_prob)1844 cmp_score = cmp_prob / prob1845 if cmp_score >= 0.001:1846 has_particle = False1847 for cmp_token in cmp_word.split(" ")[1:]:1848 if cmp_token in particles:1849 has_particle = True1850 break1851 if has_particle:1852 cmp_score *= 3.01853 if cmp_word in verb_words or cmp_word in adj_words or cmp_word in adv_words:1854 cmp_score *= 3.01855 idioms.append((cmp_word, cmp_score))1856 it.Next()1857 it = rev_live_words.MakeIterator()1858 it.Jump(prefix)1859 while True:1860 rec = it.GetStr()1861 if not rec: break1862 cmp_word, cmp_prob = rec1863 if not cmp_word.startswith(prefix): break1864 cmp_word = " ".join(reversed(cmp_word.split(" ")))1865 cmp_prob = float(cmp_prob)1866 cmp_score = cmp_prob / prob1867 if cmp_score >= 0.001:1868 has_particle = False1869 for cmp_token in cmp_word.split(" ")[:-1]:1870 if cmp_token in particles:1871 has_particle = True1872 break1873 if has_particle:1874 cmp_score *= 3.01875 if cmp_word in verb_words or cmp_word in adj_words or cmp_word in adv_words:1876 cmp_score *= 3.01877 cmp_score * 0.91878 idioms.append((cmp_word, cmp_score))1879 it.Next()1880 idioms = sorted(idioms, key=lambda x: x[1], reverse=True)1881 uniq_idioms = set()1882 final_idioms = []1883 for idiom, prob in idioms:1884 if idiom in uniq_idioms: continue1885 uniq_idioms.add(idiom)1886 final_idioms.append(idiom)1887 if final_idioms:1888 max_elems = int(min(max(math.log2(len(word_entry["item"])), 2), 6) * 4)1889 word_entry["idiom"] = final_idioms[:max_elems]1890 def SetCoocurrences(self, word_entry, entries, word_dicts, phrase_prob_dbm, cooc_prob_dbm):1891 word = word_entry["word"]1892 norm_word = tkrzw_dict.NormalizeWord(word)1893 tokens = self.tokenizer.Tokenize("en", word, True, True)1894 cooc_words = collections.defaultdict(float)1895 max_word_weight = 0.01896 for token in tokens:1897 phrase_prob = self.GetPhraseProb(phrase_prob_dbm, "en", token)1898 word_idf = math.log(phrase_prob) * -11899 word_weight = word_idf ** 21900 max_word_weight = max(max_word_weight, word_weight)1901 tsv = cooc_prob_dbm.GetStr(token)1902 if tsv:1903 for field in tsv.split("\t")[:32]:1904 cooc_word, cooc_prob = field.split(" ", 1)1905 cooc_tokens = self.tokenizer.Tokenize("en", cooc_word, True, True)1906 for cooc_token in cooc_tokens:1907 if cooc_token and cooc_word not in tokens:1908 cooc_words[cooc_token] += float(cooc_prob) * word_weight1909 def_token_labels = collections.defaultdict(set)1910 for item in word_entry["item"]:1911 label = item["label"]1912 if label not in self.full_def_labels: continue1913 text = item["text"]1914 text = regex.sub(r" \[-.*", "", text).strip()1915 if regex.search(r"^\[-.*", text): continue1916 text = regex.sub(r"\(.*?\)", "", text)1917 text = regex.sub(r"\[.*?\]", "", text)1918 if not text: continue1919 def_tokens = self.tokenizer.Tokenize("en", text, True, True)1920 for def_token in def_tokens:1921 if not regex.fullmatch(r"[\p{Latin}]{2,}", def_token): continue1922 if def_token in particles or def_token in 
misc_stop_words: continue1923 if def_token in tokens: continue1924 def_token_labels[def_token].add(label)1925 for def_token, labels in def_token_labels.items():1926 cooc_words[def_token] += 0.01 * len(labels) * max_word_weight1927 is_wiki_word = "wikipedia" in cooc_words or "encyclopedia" in cooc_words1928 merged_cooc_words = sorted(cooc_words.items(), key=lambda x: x[1], reverse=True)1929 weighed_cooc_words = []1930 for cooc_word, cooc_score in merged_cooc_words:1931 cooc_prob = self.GetPhraseProb(phrase_prob_dbm, "en", cooc_word)1932 cooc_idf = math.log(cooc_prob) * -11933 cooc_score *= cooc_idf ** 21934 if tkrzw_dict.IsStopWord("en", cooc_word):1935 if tkrzw_dict.IsStopWord("en", norm_word):1936 cooc_score *= 0.31937 else:1938 cooc_score *= 0.11939 elif cooc_word in particles or cooc_word in misc_stop_words:1940 cooc_score *= 0.51941 elif is_wiki_word and cooc_word in wiki_stop_words:1942 cooc_score *= 0.21943 weighed_cooc_words.append((cooc_word, cooc_score))1944 sorted_cooc_words = sorted(weighed_cooc_words, key=lambda x: x[1], reverse=True)1945 final_cooc_words = []1946 for cooc_word, cooc_score in sorted_cooc_words:1947 if len(final_cooc_words) >= 16: break1948 hit = False1949 for label, word_dict in word_dicts:1950 if label in self.surfeit_labels: continue1951 if cooc_word in word_dict:1952 hit = True1953 break1954 if not hit: continue1955 final_cooc_words.append(cooc_word)1956 if final_cooc_words:1957 word_entry["cooccurrence"] = final_cooc_words1958 def CompensateInflections(self, entry, merged_dict, verb_words):1959 word = entry["word"]1960 root_verb = None1961 ing_value = entry.get("verb_present_participle")1962 if ing_value and ing_value.endswith("<ing"):1963 root_verb = ing_value[:-4]1964 for infl_name in inflection_names:1965 value = entry.get(infl_name)1966 if value and not regex.fullmatch(r"[-\p{Latin}0-9', ]+", value):1967 del entry[infl_name]1968 poses = set()1969 for item in entry["item"]:1970 poses.add(item["pos"])1971 if "verb" in poses and word.find(" ") >= 0 and not regex.search(r"[A-Z]", word):1972 tokens = self.tokenizer.Tokenize("en", word, False, False)1973 if len(tokens) > 1:1974 if not root_verb:1975 for token in tokens:1976 if token not in particles and token not in misc_stop_words and token in verb_words:1977 root_verb = token1978 break1979 if root_verb:1980 root_entry = merged_dict.get(root_verb)1981 if root_entry:1982 for infl_name in inflection_names:1983 if not infl_name.startswith("verb_") or entry.get(infl_name):1984 continue1985 root_infls = root_entry[0].get(infl_name)1986 if not root_infls:1987 continue1988 phrase_infls = []1989 for root_infl in regex.split(r"[,|]", root_infls):1990 root_infl = root_infl.strip()1991 if not root_infl: continue1992 root_infl_tokens = []1993 for token in tokens:1994 if root_infl and token == root_verb:1995 root_infl_tokens.append(root_infl)1996 root_infl = None1997 else:1998 root_infl_tokens.append(token)1999 phrase_infls.append(" ".join(root_infl_tokens))2000 if phrase_infls:2001 entry[infl_name] = ", ".join(phrase_infls)2002 def CompensateAlternatives(self, word_entry, merged_dict):2003 word = word_entry["word"]2004 alternatives = word_entry.get("alternative") or []2005 variants = self.GetSpellVariants(word)2006 wn_count = 02007 for item in word_entry["item"]:2008 if item["label"] != "wn": continue2009 wn_count += 12010 for section in item["text"].split("[-]"):2011 section = section.strip()2012 match = regex.search(r"\[synonym\]: (.*)", section)2013 if match:2014 for synonym in match.group(1).split(","):2015 synonym 
= synonym.strip()2016 dist = tkrzw.Utility.EditDistanceLev(word, synonym)2017 similar = False2018 if dist == 1 and word[:3] != synonym[:3]:2019 similar = True2020 elif dist == 2 and word[:5] == synonym[:5] and word[-2:] == synonym[-2:]:2021 similar = True2022 if similar and synonym not in variants:2023 variants.add(synonym)2024 for variant in variants:2025 if word[:2] != variant[:2]: continue2026 if variant in alternatives: continue2027 variant_entries = merged_dict.get(variant)2028 if not variant_entries: continue2029 for variant_entry in variant_entries:2030 if variant_entry["word"] != variant: continue2031 var_wn_count = 02032 var_wn_counts = collections.defaultdict(int)2033 for item in variant_entry["item"]:2034 if item["label"] != "wn": continue2035 var_wn_count += 12036 for section in item["text"].split("[-]"):2037 section = section.strip()2038 match = regex.search(r"\[synonym\]: (.*)", section)2039 if match:2040 for synonym in match.group(1).split(","):2041 synonym = synonym.strip()2042 if synonym:2043 var_wn_counts[synonym] += 12044 hits = var_wn_counts[word]2045 if (wn_count > 0 and var_wn_count == wn_count and hits == wn_count) or hits >= 4:2046 alternatives.append(variant)2047 if alternatives:2048 word_entry["alternative"] = alternatives2049 def GetSpellVariants(self, word):2050 variants = set()2051 suffix_pairs = [("se", "ze"), ("sing", "zing"), ("sed", "zed"), ("ser", "zer"),2052 ("sation", "zation"), ("ce", "se"),2053 ("our", "or"), ("og", "ogue"), ("re", "er"), ("l", "ll")]2054 for suffix1, suffix2 in suffix_pairs:2055 if word.endswith(suffix1):2056 variant = word[:-len(suffix1)] + suffix22057 variants.add(variant)2058 if word.endswith(suffix2):2059 variant = word[:-len(suffix2)] + suffix12060 variants.add(variant)2061 return variants2062 def GetEntryTranslations(self, merged_dict, word, is_capital, best_pos):2063 key = tkrzw_dict.NormalizeWord(word)2064 entry = merged_dict.get(key)2065 if not entry: return None2066 scored_trans = []2067 word_score = 1.02068 for word_entry in entry:2069 cmp_word = word_entry["word"]2070 if bool(regex.search(r"\p{Lu}", cmp_word)) != is_capital:2071 continue2072 item_score = 1.02073 for item in word_entry["item"]:2074 pos = item["pos"]2075 text = item["text"]2076 trans = self.ExtractTextLabelTrans(text)2077 if trans:2078 score = word_score * item_score2079 if pos == best_pos:2080 score *= 2.02081 for tran in trans:2082 scored_trans.append((tran, score))2083 score *= 0.92084 item_score *= 0.92085 trans = word_entry.get("translation")2086 if trans:2087 score = word_score * item_score2088 for tran in trans:2089 scored_trans.append((tran, score))2090 score *= 0.92091 word_score *= 0.52092 scored_trans = sorted(scored_trans, key=lambda x: x[1], reverse=True)2093 return [x[0] for x in scored_trans]2094 def PropagateTranslations(self, entry, merged_dict, tran_prob_dbm, aux_last_trans):2095 old_trans = entry.get("translation") or []2096 if len(old_trans) >= 8: return2097 word = entry["word"]2098 is_capital = bool(regex.search(r"\p{Lu}", word))2099 if len(word) <= 2: return2100 uniq_labels = set()2101 top_exprs = []2102 poses = set()2103 synonyms = []2104 for item in entry["item"]:2105 label = item["label"]2106 pos = item["pos"]2107 poses.add(pos)2108 if label in self.gross_labels or label in self.supplement_labels: continue2109 is_first = label not in uniq_labels2110 uniq_labels.add(label)2111 text = item["text"]2112 for field in text.split(" [-] "):2113 if not field.startswith("[synonym]: "): continue2114 field = regex.sub(r"^[^:]+: ", "", field)2115 
field = regex.sub(r"\(.*?\) *", "", field)2116 for synonym in field.split(","):2117 synonym = synonym.strip()2118 if synonym:2119 synonyms.append((synonym, pos))2120 text = regex.sub(r" \[-+\] .*", "", text)2121 text = regex.sub(r"\(.*?\)", "", text)2122 text = regex.sub(r"\.$", "", text)2123 text = regex.sub(r"([-\p{Latin}\d]{5,})\.", r"\1;", text)2124 for expr in text.split(";"):2125 expr = expr.strip()2126 if pos == "verb":2127 expr = regex.sub(r"^to +([\p{Latin}])", r"\1", expr, flags=regex.IGNORECASE)2128 elif pos == "noun":2129 expr = regex.sub(r"^(a|an|the) +([\p{Latin}])", r"\2", expr, flags=regex.IGNORECASE)2130 if expr:2131 top_exprs.append((expr, pos, is_first))2132 top_words = []2133 for expr, pos, is_first in top_exprs:2134 manner_match = regex.search(r"^in +([-\p{Latin}].*?) +(manner|fashion|way)$",2135 expr, regex.IGNORECASE)2136 preps = ["of", "in", "at", "from", "by", "part of", "out of", "inside",2137 "relating to", "related to", "associated with",2138 "characterized by", "pertaining to", "derived from"]2139 prep_expr = None2140 for prep in preps:2141 if len(expr) > len(prep):2142 if expr[:len(prep)].lower() == prep:2143 expr_lead = expr[len(prep):]2144 joint_match = regex.match(r"^,?( +or)? +", expr_lead)2145 if joint_match:2146 expr = expr_lead[joint_match.end():]2147 prep_expr = expr2148 if manner_match:2149 expr = manner_match.group(1).strip()2150 expr = regex.sub(r"^(a|an|the) +", "", expr, flags=regex.IGNORECASE)2151 if expr:2152 top_words.append((expr, "adjective", "adverb", is_first))2153 elif prep_expr:2154 expr = regex.sub(r"^(a|an|the) +([\p{Latin}])", r"\2", prep_expr, flags=regex.IGNORECASE)2155 if expr:2156 new_pos = "adverb" if pos == "adverb" else "adjective"2157 top_words.append((expr, "noun", new_pos, is_first))2158 else:2159 expr = expr.strip()2160 if expr:2161 top_words.append((expr, pos, "", is_first))2162 etym_prefix = entry.get("etymology_prefix")2163 etym_core = entry.get("etymology_core")2164 etym_suffix = entry.get("etymology_suffix")2165 if ("noun" in poses and not etym_prefix and etym_core and2166 etym_suffix in ("ness", "cy", "ity")):2167 top_words.append((etym_core, "adjective", "noun", True))2168 if ("noun" in poses and not etym_prefix and etym_core and2169 etym_suffix in ("ment", "tion", "sion")):2170 top_words.append((etym_core, "verb", "noun", True))2171 if ("verb" in poses and not etym_prefix and etym_core and2172 etym_suffix in ("ise", "ize")):2173 top_words.append((etym_core, "adjective", "verb", True))2174 if ("adjective" in poses and not etym_prefix and etym_core2175 and etym_suffix in ("ic", "ical", "ish", "ly")):2176 top_words.append((etym_core, "noun", "adjective", True))2177 if ("adverb" in poses and not etym_prefix and etym_core and2178 etym_suffix == "ly"):2179 top_words.append((etym_core, "adjective", "adverb", True))2180 parents = entry.get("parent")2181 if parents:2182 for parent in parents:2183 if len(parent) < 5: continue2184 if ("noun" in poses and2185 (word.endswith("ness") or word.endswith("cy") or word.endswith("ity"))):2186 top_words.append((parent, "adjective", "noun", True))2187 if ("noun" in poses and2188 (word.endswith("ment") or word.endswith("tion") or word.endswith("sion"))):2189 top_words.append((parent, "verb", "noun", True))2190 if ("verb" in poses and2191 (word.endswith("ise") or word.endswith("tze"))):2192 top_words.append((parent, "adjective", "verb", True))2193 if ("adjective" in poses and2194 (word.endswith("ic") or word.endswith("ical") or word.endswith("ish"))):2195 top_words.append((parent, "noun", 
"adjective", True))2196 if ("adverb" in poses and2197 word.endswith("ly")):2198 top_words.append((parent, "adjective", "adverb", True))2199 ent_synonyms = entry.get("_synonym")2200 if ent_synonyms:2201 for synonym in ent_synonyms:2202 norm_synonym = tkrzw_dict.NormalizeWord(synonym)2203 syn_entries = merged_dict.get(norm_synonym)2204 if syn_entries:2205 syn_pos = ""2206 for syn_entry in syn_entries:2207 if syn_entry["word"] != synonym: continue2208 for syn_item in syn_entry["item"]:2209 if syn_item["pos"] in poses:2210 syn_pos = syn_item["pos"]2211 break2212 if syn_pos:2213 synonyms.append((synonym, syn_pos))2214 for synonym, pos in synonyms:2215 top_words.append((synonym, pos, "", False))2216 trans = []2217 tran_sources = set()2218 for expr, pos, conversion, trustable in top_words:2219 expr = regex.sub(r"^([-\p{Latin}]+), ([-\p{Latin}]+),? +or +([-\p{Latin}]+)$",2220 r"\1; \2; \3", expr)2221 expr = regex.sub(r"^([-\p{Latin}]+) +or +([-\p{Latin}]+)$", r"\1; \2", expr)2222 expr = regex.sub(r"^([-\p{Latin}]+), +([-\p{Latin}]+)$", r"\1; \2", expr)2223 for rel_word in expr.split(";"):2224 rel_word = rel_word.strip()2225 if len(rel_word) <= 2: continue2226 word_trans = self.GetEntryTranslations(merged_dict, rel_word, is_capital, pos)2227 if not word_trans: continue2228 new_pos = conversion or pos2229 if new_pos == "noun":2230 word_trans = [self.MakeTranNoun(x) for x in word_trans]2231 elif new_pos == "verb":2232 word_trans = [self.MakeTranVerb(x) for x in word_trans]2233 elif new_pos == "adjective":2234 word_trans = [self.MakeTranAdjective(x) for x in word_trans]2235 elif new_pos == "adverb":2236 word_trans = [self.MakeTranAdverb(x) for x in word_trans]2237 for rank, word_tran in enumerate(word_trans):2238 tran_source = (word_tran, rel_word)2239 if tran_source in tran_sources: continue2240 tran_sources.add(tran_source)2241 trans.append((word_tran, trustable, rel_word, rank))2242 prob_trans = {}2243 key = tkrzw_dict.NormalizeWord(word)2244 tsv = tran_prob_dbm.GetStr(key)2245 if tsv:2246 fields = tsv.split("\t")2247 for i in range(0, len(fields), 3):2248 src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])2249 if regex.search("[ã£ã]$", trg) and self.tokenizer.GetJaLastPos(trg)[1] == "åè©":2250 continue2251 norm_trg = tkrzw_dict.NormalizeWord(trg)2252 prob = float(prob)2253 if src != word:2254 prob *= 0.12255 prob_trans[norm_trg] = max(prob_trans.get(norm_trg) or 0.0, prob)2256 scored_trans = []2257 tran_counts = {}2258 for tran, trustable, rel_word, rank in trans:2259 tran_counts[tran] = (tran_counts.get(tran) or 0) + 12260 for tran, trustable, rel_word, rank in trans:2261 norm_tran = tkrzw_dict.NormalizeWord(tran)2262 max_weight = 02263 prob_hit = False2264 for prob_tran, prob in prob_trans.items():2265 prob **= 0.252266 dist = tkrzw.Utility.EditDistanceLev(norm_tran, prob_tran)2267 dist /= max(len(norm_tran), len(prob_tran))2268 weight = prob ** 0.5 + 2.0 - dist2269 if norm_tran == prob_tran:2270 weight *= 102271 prob_hit = True2272 elif len(prob_tran) >= 2 and norm_tran.startswith(prob_tran):2273 weight *= 52274 prob_hit = True2275 elif len(norm_tran) >= 2 and prob_tran.startswith(norm_tran):2276 weight *= 52277 prob_hit = True2278 elif len(prob_tran) >= 2 and norm_tran.find(prob_tran) >= 0:2279 weight *= 32280 prob_hit = True2281 elif len(norm_tran) >= 2 and prob_tran.find(norm_tran) >= 0:2282 weight *= 32283 prob_hit = True2284 elif dist < 0.3:2285 weight *= 22286 prob_hit = True2287 max_weight = max(max_weight, weight)2288 if not trustable and not prob_hit:2289 continue2290 
tran_count = tran_counts[tran]2291 count_score = 1 + (tran_count * 0.2)2292 rank_score = 0.95 ** rank2293 score = max_weight * count_score * rank_score2294 scored_trans.append((tran, score, prob_hit))2295 scored_trans = sorted(scored_trans, key=lambda x: x[1], reverse=True)2296 rec_aux_trans = aux_last_trans.get(word)2297 if rec_aux_trans:2298 scored_aux_trans = []2299 for aux_tran in rec_aux_trans:2300 norm_trg = tkrzw_dict.NormalizeWord(aux_tran)2301 prob = prob_trans.get(norm_trg) or 0.02302 prob += 0.01 / (len(aux_tran) + 1)2303 scored_aux_trans.append((aux_tran, prob))2304 scored_aux_trans = sorted(scored_aux_trans, key=lambda x: x[1], reverse=True)2305 for aux_tran, score in scored_aux_trans:2306 scored_trans.append((aux_tran, 0, False))2307 final_trans = []2308 uniq_trans = set()2309 for tran in old_trans:2310 norm_tran = tkrzw_dict.NormalizeWord(tran)2311 uniq_trans.add(norm_tran)2312 final_trans.append(tran)2313 num_rank = 02314 for tran, score, prob_hit in scored_trans:2315 if len(final_trans) >= 8: break2316 norm_tran = tkrzw_dict.NormalizeWord(tran)2317 if norm_tran in uniq_trans: continue2318 num_rank += 12319 if not prob_hit:2320 if num_rank > 3: continue2321 if num_rank > 2 and len(final_trans) >= 3: continue2322 uniq_trans.add(norm_tran)2323 final_trans.append(tran)2324 if final_trans:2325 entry["translation"] = final_trans2326 def MakeTranNoun(self, tran):2327 pos = self.tokenizer.GetJaLastPos(tran)2328 stem = self.tokenizer.CutJaWordNounParticle(tran)2329 if tran.endswith("ãã"):2330 tran = tran[:-2]2331 elif tran.endswith("ããã"):2332 tran = tran[:-3]2333 elif tran.endswith("ããã"):2334 tran = tran[:-3]2335 elif tran.endswith("ãããª"):2336 tran = tran[:-3]2337 elif self.tokenizer.IsJaWordAdjvNoun(stem):2338 tran = stem2339 elif tran.endswith("ã") and pos[1] == "形容è©":2340 tran = tran[:-1] + "ã"2341 elif pos[1] in "åè©" and regex.search(r"[ãããã¤ã¬ãµããã]$", tran):2342 tran = tran + "ãã¨"2343 elif pos[1] in "形容è©" and regex.search(r"[ãã]$", tran):2344 tran = tran + "ãã¨"2345 elif pos[0] in ("ã", "ãª") and pos[1] == "å©åè©":2346 tran = tran + "ãã¨"2347 return tran2348 def MakeTranVerb(self, tran):2349 pos = self.tokenizer.GetJaLastPos(tran)2350 if self.tokenizer.IsJaWordSahenNoun(tran):2351 tran = tran + "ãã"2352 elif tran.endswith("ã") and pos[1] == "形容è©":2353 tran = tran[:-1] + "ããã"2354 elif pos[1] == "åè©" and pos[2] == "形容åè©èªå¹¹":2355 tran = tran + "ã«ãã"2356 return tran2357 def MakeTranAdjective(self, tran):2358 pos = self.tokenizer.GetJaLastPos(tran)2359 stem = self.tokenizer.CutJaWordNounParticle(tran)2360 is_adjv = False2361 if tran.endswith("ãã"):2362 tran = tran[:-2]2363 elif tran.endswith("ããã"):2364 tran = tran[:-3]2365 elif tran.endswith("ããã"):2366 tran = tran[:-3]2367 elif tran.endswith("ãããª"):2368 tran = tran[:-3]2369 elif self.tokenizer.IsJaWordAdjvNoun(stem):2370 tran = stem2371 is_adjv = True2372 pos = self.tokenizer.GetJaLastPos(tran)2373 if self.tokenizer.IsJaWordAdjvNounOnly(tran):2374 tran += "ãª"2375 elif pos[1] == "åè©":2376 if tran.endswith("ç"):2377 tran += "ãª"2378 else:2379 tran += "ã®"2380 return tran2381 def MakeTranAdverb(self, tran):2382 pos = self.tokenizer.GetJaLastPos(tran)2383 stem = self.tokenizer.CutJaWordNounParticle(tran)2384 if tran.endswith("ãã"):2385 tran = tran[:-2] + "ãã¦"2386 elif tran.endswith("ããã"):2387 tran = tran[:-3] + "ããã¦"2388 elif tran.endswith("ããã"):2389 tran = tran[:-3] + "ããã¦"2390 elif tran.endswith("ãããª"):2391 tran = tran[:-3] + "ããã«"2392 elif tran.endswith("ããã"):2393 tran = tran[:-3] + "ããã"2394 elif 
tran.endswith("ã¨ãã"):2395 tran = tran[:-3] + "ã¨ãã¦"2396 elif tran.endswith("ã") and pos[1] == "形容è©":2397 tran = tran[:-1] + "ã"2398 elif tran.endswith("çãª"):2399 tran = tran[:-1] + "ã«"2400 elif self.tokenizer.IsJaWordSahenNoun(stem):2401 tran = stem + "ãã¦"2402 elif self.tokenizer.IsJaWordAdjvNoun(stem):2403 tran = stem + "ã«"2404 elif stem != tran or pos[1] == "åè©":2405 tran = stem + "ã§"2406 elif pos[0] == "ã" and pos[1] == "å©åè©":2407 tran = tran[:-1] + "ã¦"2408 elif pos[1] == "åè©":2409 tran = stem + "ããã«"2410 return tran2411 def SetPhraseTranslations(self, entry, merged_dict, aux_trans, aux_last_trans,2412 tran_prob_dbm, phrase_prob_dbm, noun_words, verb_words,2413 live_words, rev_live_words):2414 if not tran_prob_dbm or not phrase_prob_dbm:2415 return2416 word = entry["word"]2417 if not regex.fullmatch(r"[-\p{Latin}]+", word):2418 return2419 if len(word) < 2 or word in ("an", "the"):2420 return2421 is_noun = word in noun_words2422 is_verb = word in verb_words2423 word_prob = float(phrase_prob_dbm.GetStr(word) or 0.0)2424 if word_prob < 0.00001:2425 return2426 word_mod_prob = min(word_prob, 0.001)2427 norm_word = " ".join(self.tokenizer.Tokenize("en", word, True, True))2428 if word != norm_word:2429 return2430 phrases = []2431 for particle in particles:2432 phrase = word + " " + particle2433 phrase_prob = float(phrase_prob_dbm.GetStr(phrase) or 0.0)2434 ratio = phrase_prob / word_mod_prob2435 if is_verb and ratio >= 0.005:2436 for pron in ("me", "us", "you", "him", "her", "it", "them"):2437 pron_phrase = word + " " + pron + " " + particle2438 pron_phrase_prob = float(phrase_prob_dbm.GetStr(pron_phrase) or 0.0)2439 if pron_phrase_prob > 0.0:2440 phrase_prob += pron_phrase_prob * 2.02441 ratio = phrase_prob / word_mod_prob2442 phrases.append((phrase, True, ratio, ratio, phrase_prob))2443 if ratio >= 0.005:2444 for sub_particle in particles:2445 sub_phrase = phrase + " " + sub_particle2446 sub_phrase_prob = float(phrase_prob_dbm.GetStr(sub_phrase) or 0.0)2447 sub_ratio = max(sub_phrase_prob / phrase_prob, 0.01)2448 phrases.append((sub_phrase, True, max(sub_ratio, ratio),2449 ratio * (sub_ratio ** 0.005), sub_phrase_prob))2450 verb_prob = 0.02451 if is_verb:2452 for auxverb in ("not", "will", "shall", "can", "may", "must"):2453 auxverb_prob = float(phrase_prob_dbm.GetStr(auxverb + " " + word) or 0.0)2454 verb_prob += auxverb_prob2455 verb_prob *= 202456 for particle in particles:2457 phrase = particle + " " + word2458 phrase_prob = float(phrase_prob_dbm.GetStr(phrase) or 0.0)2459 if particle == "to":2460 phrase_prob -= verb_prob2461 ratio = phrase_prob / word_mod_prob2462 phrases.append((phrase, False, ratio, ratio, phrase_prob))2463 if is_noun:2464 for art in ("the", "a", "an"):2465 sub_phrase = particle + " " + art + " " + word2466 sub_phrase_prob = float(phrase_prob_dbm.GetStr(sub_phrase) or 0.0)2467 sub_ratio = sub_phrase_prob / word_mod_prob2468 phrases.append((sub_phrase, False, sub_ratio, sub_ratio, sub_phrase_prob))2469 it = live_words.MakeIterator()2470 it.Jump(word + " ")2471 while True:2472 rec = it.GetStr()2473 if not rec: break2474 phrase, phrase_prob = rec2475 if not phrase.startswith(word + " "): break2476 phrase_prob = float(phrase_prob)2477 ratio = phrase_prob / word_prob2478 if ratio >= 0.05:2479 phrases.append((phrase, True, ratio, ratio, phrase_prob))2480 it.Next()2481 it = rev_live_words.MakeIterator()2482 it.Jump(word + " ")2483 while True:2484 rec = it.GetStr()2485 if not rec: break2486 phrase, phrase_prob = rec2487 if not phrase.startswith(word + " "): 
break2488 phrase_prob = float(phrase_prob)2489 ratio = phrase_prob / word_prob2490 if ratio >= 0.05:2491 phrase = " ".join(reversed(phrase.split(" ")))2492 phrases.append((phrase, True, ratio, ratio, phrase_prob))2493 it.Next()2494 if not phrases:2495 return2496 orig_trans = {}2497 tsv = tran_prob_dbm.GetStr(word)2498 if tsv:2499 fields = tsv.split("\t")2500 for i in range(0, len(fields), 3):2501 src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])2502 trg = regex.sub(r"[ï½ã]", "", trg)2503 trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2504 if src == word and prob >= 0.06:2505 orig_trans[trg] = prob2506 aux_orig_trans = (aux_trans.get(word) or []) + (aux_last_trans.get(word) or [])2507 if aux_orig_trans:2508 for trg in set(aux_orig_trans):2509 trg = regex.sub(r"[ï½ã]", "", trg)2510 trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2511 orig_trans[trg] = float(orig_trans.get(trg) or 0) + 0.12512 ent_orig_trans = entry.get("translation")2513 if ent_orig_trans:2514 base_score = 0.12515 for ent_orig_tran in ent_orig_trans:2516 orig_trans[ent_orig_tran] = float(orig_trans.get(ent_orig_tran) or 0) + base_score2517 base_score *= 0.92518 final_phrases = []2519 uniq_phrases = set()2520 for phrase, is_suffix, mod_prob, phrase_score, raw_prob in phrases:2521 if phrase in uniq_phrases: continue2522 uniq_phrases.add(phrase)2523 phrase_trans = {}2524 phrase_prefixes = {}2525 pos_match = is_verb if is_suffix else is_noun2526 if mod_prob >= 0.02:2527 if pos_match:2528 tsv = tran_prob_dbm.GetStr(phrase)2529 if tsv:2530 fields = tsv.split("\t")2531 for i in range(0, len(fields), 3):2532 src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])2533 if src != phrase:2534 continue2535 if regex.search("[ã£ã]$", trg) and self.tokenizer.GetJaLastPos(trg)[1] == "åè©":2536 continue2537 if (is_verb and regex.search("[ãããã¡ã«ã²ã¿ã]$", trg) and2538 self.tokenizer.GetJaLastPos(trg)[1] == "åè©"):2539 continue2540 trg = regex.sub(r"[ï½ã]", "", trg)2541 trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2542 if not trg or regex.fullmatch(r"[\p{Katakana}ã¼]+", trg):2543 continue2544 pos = self.tokenizer.GetJaLastPos(trg)2545 if (is_noun and is_suffix and pos[1] == "åè©" and2546 not self.tokenizer.IsJaWordSahenNoun(trg)):2547 continue2548 if is_noun and is_suffix and trg in ("ãã", "ãã", "ã§ã", "ã¾ã"):2549 continue2550 orig_prob = orig_trans.get(trg) or 0.02551 if is_verb:2552 if self.tokenizer.IsJaWordSahenNoun(trg):2553 orig_prob = max(orig_prob, orig_trans.get(trg + "ãã") or 0.0)2554 for ext_suffix in ("ãã", "ãã", "ãã¦", "ããã", "ããã", "ããã¦"):2555 orig_prob = max(orig_prob, orig_trans.get(trg[:len(ext_suffix)]) or 0.0)2556 if (is_suffix and is_verb and not trg_prefix and trg_suffix and2557 (pos[1] == "åè©" or self.tokenizer.IsJaWordSahenNoun(trg))):2558 trg_prefix = trg_suffix2559 trg_suffix = ""2560 elif is_suffix and is_noun and not trg_prefix:2561 if trg_suffix == "ã®ãã":2562 trg_suffix = "ããã®"2563 trg_prefix = trg_suffix2564 trg_suffix = ""2565 elif not trg_suffix and trg_prefix in ("ããã®", "ã®ãã"):2566 if trg.endswith("ãã"):2567 trg += "ããã®"2568 else:2569 trg += "ã®ãã"2570 trg_prefix = ""2571 elif trg_suffix:2572 trg += trg_suffix2573 sum_prob = orig_prob + prob2574 if sum_prob >= 0.1:2575 if is_verb and pos[1] == "åè©":2576 sum_prob += 0.12577 phrase_trans[trg] = float(phrase_trans.get(trg) or 0.0) + sum_prob2578 if trg_prefix and not trg_suffix:2579 part_key = trg + ":" + trg_prefix2580 phrase_prefixes[part_key] = float(phrase_trans.get(part_key) 
or 0.0) + sum_prob2581 for aux_phrase_trans in (aux_trans.get(phrase), aux_last_trans.get(phrase)):2582 if aux_phrase_trans:2583 for trg in aux_phrase_trans:2584 trg = regex.sub(r"[ï½ã]", "", trg)2585 trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2586 if is_noun and is_suffix and trg in ("ãã", "ãã", "ã§ã", "ã¾ã"):2587 continue2588 phrase_trans[trg] = float(phrase_trans.get(trg) or 0.0) + 0.12589 if mod_prob >= 0.001:2590 phrase_entries = merged_dict.get(phrase)2591 if phrase_entries:2592 for phrase_entry in phrase_entries:2593 if phrase_entry["word"] != phrase: continue2594 ent_phrase_trans = phrase_entry.get("translation")2595 if ent_phrase_trans:2596 base_score = 0.152597 for trg in ent_phrase_trans:2598 trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2599 phrase_trans[trg] = float(phrase_trans.get(trg) or 0.0) + base_score2600 if trg_prefix and not trg_suffix:2601 part_key = trg + ":" + trg_prefix2602 phrase_prefixes[part_key] = float(phrase_trans.get(part_key) or 0.0) + base_score2603 base_score *= 0.92604 if not phrase_trans:2605 continue2606 for tran in list(phrase_trans.keys()):2607 if not regex.search(r"[\p{Han}\p{Katakana}]", tran):2608 continue2609 for cmp_tran, cmp_score in list(phrase_trans.items()):2610 if cmp_tran not in phrase_trans: continue2611 if cmp_tran.startswith(tran):2612 suffix = cmp_tran[len(tran):]2613 if suffix in ("ãã", "ããã", "ããã", "ã«", "ãª", "ã®"):2614 phrase_trans[cmp_tran] = cmp_score + float(phrase_trans.get(tran) or 0)2615 if tran in phrase_trans:2616 del phrase_trans[tran]2617 mod_trans = {}2618 for tran, score in phrase_trans.items():2619 prefix_check = tran + ":"2620 best_prefix = ""2621 best_prefix_score = 0.02622 for prefix, score in phrase_prefixes.items():2623 if not prefix.startswith(prefix_check): continue2624 if score >= best_prefix_score:2625 best_prefix = prefix[len(prefix_check):]2626 best_prefix_score = score2627 if regex.search(r"^[\p{Katakana}ã¼]", tran):2628 score *= 0.52629 pos = self.tokenizer.GetJaLastPos(tran)2630 if is_suffix and is_verb:2631 if pos[1] == "åè©" and regex.search("[ãããã¤ã¬ãµããã]$", tran):2632 score *= 1.52633 if pos[1] == "åè©" and not self.tokenizer.IsJaWordSahenNoun(tran):2634 score *= 0.52635 if not is_suffix and pos[1] == "åè©" and not best_prefix:2636 if self.tokenizer.IsJaWordSahenNoun(tran) or self.tokenizer.IsJaWordAdjvNoun(tran):2637 score *= 0.72638 else:2639 score *= 0.52640 if len(tran) <= 1:2641 score *= 0.52642 if is_verb:2643 orig_tran = tran2644 pos = self.tokenizer.GetJaLastPos(tran)2645 if self.tokenizer.IsJaWordSahenNoun(tran) and best_prefix != "ã®":2646 tran = tran + "ãã"2647 if best_prefix and best_prefix not in ("ã", "ã", "ã¯"):2648 tran = "({}){}".format(best_prefix, tran)2649 mod_trans[tran] = float(mod_trans.get(tran) or 0.0) + score2650 scored_trans = sorted(mod_trans.items(), key=lambda x: x[1], reverse=True)[:4]2651 if scored_trans:2652 final_phrases.append((phrase, phrase_score, raw_prob, [x[0] for x in scored_trans]))2653 if final_phrases:2654 final_phrases = sorted(final_phrases, key=lambda x: x[1], reverse=True)2655 map_phrases = []2656 for phrase, score, raw_prob, trans in final_phrases:2657 prob_expr = "{:.6f}".format(raw_prob / word_prob).replace("0.", ".")2658 map_phrase = {"w": phrase, "p": prob_expr, "x": trans}2659 if phrase in merged_dict:2660 map_phrase["i"] = "1"2661 map_phrases.append(map_phrase)2662 entry["phrase"] = map_phrases2663 def FilterParents(self, word_entry, merged_dict):2664 word = word_entry["word"]2665 parents = 
word_entry.get("parent")2666 if not parents or len(parents) < 2: return2667 ancestors = parents2668 while True:2669 grand_ancestors = []2670 for ancestor in ancestors:2671 ancestor_entries = merged_dict.get(ancestor)2672 if ancestor_entries:2673 for ancestor_entry in ancestor_entries:2674 if ancestor_entry["word"] != ancestor: continue2675 for grand_ancestor in ancestor_entry.get("parent") or []:2676 if grand_ancestor in ancestors and grand_ancestor not in grand_ancestors:2677 grand_ancestors.append(grand_ancestor)2678 if not grand_ancestors or len(grand_ancestors) == len(ancestors):2679 break2680 ancestors = grand_ancestors2681 scored_parents = []2682 for parent in parents:2683 parent_prob = 02684 parent_entries = merged_dict.get(parent)2685 if parent_entries:2686 for parent_entry in parent_entries:2687 if parent_entry["word"] != parent: continue2688 parent_prob = float(parent_entry.get("probability")) or 02689 score = parent_prob + 0.0000012690 if word.startswith(parent):2691 score *= 22692 if parent in ancestors:2693 score += 12694 else:2695 is_dup = False2696 for suffix in ("ing", "ed", "er", "or", "ism", "ist", "est"):2697 for ancestor in ancestors:2698 candidate = ancestor + suffix2699 if (parent[:3] == candidate[:3] and2700 tkrzw.Utility.EditDistanceLev(parent, candidate) < 2):2701 is_dup = True2702 if is_dup:2703 continue2704 scored_parents.append((parent, score))2705 scored_parents = sorted(scored_parents, key=lambda x: x[1], reverse=True)2706 word_entry["parent"] = [x[0] for x in scored_parents]2707 def AbsorbInflections(self, word_entry, merged_dict):2708 word = word_entry["word"]2709 infls = []2710 for infl_name in inflection_names:2711 infl_value = word_entry.get(infl_name)2712 if infl_value:2713 for infl in infl_value.split(","):2714 infl = infl.strip()2715 if infl and infl != word and infl not in infls:2716 infls.append(infl)2717 phrases = []2718 for infl in infls:2719 infl_entries = merged_dict.get(infl)2720 if not infl_entries: continue2721 for infl_entry in infl_entries:2722 if infl_entry["word"] != infl: continue2723 is_core = False2724 good_labels = set()2725 num_good_items = 02726 for infl_item in infl_entry["item"]:2727 label = infl_item["label"]2728 text = infl_item["text"]2729 if label in self.supplement_labels: continue2730 if regex.search(r"^\[\w+]:", text): continue2731 good_labels.add(label)2732 if label in self.core_labels:2733 is_core = True2734 num_good_items += 12735 alive = True2736 if len(good_labels) < 2 and not is_core and num_good_items < 3:2737 infl_entry["deleted"] = True2738 alive = False2739 infl_trans = infl_entry.get("translation")2740 if infl_trans:2741 phrase = {"w": infl, "x": infl_trans[:4]}2742 if alive:2743 phrase["i"] = "1"2744 phrases.append(phrase)2745 if phrases:2746 old_phrases = word_entry.get("phrase")2747 if old_phrases:2748 phrases = phrases + old_phrases2749 word_entry["phrase"] = phrases2750def main():2751 args = sys.argv[1:]2752 output_path = tkrzw_dict.GetCommandFlag(args, "--output", 1) or "union-body.tkh"2753 core_labels = set((tkrzw_dict.GetCommandFlag(args, "--core", 1) or "xa,wn").split(","))2754 full_def_labels = set((tkrzw_dict.GetCommandFlag(2755 args, "--full_def", 1) or "ox,wn,we").split(","))2756 gross_labels = set((tkrzw_dict.GetCommandFlag(args, "--gross", 1) or "wj").split(","))2757 top_labels = set((tkrzw_dict.GetCommandFlag(args, "--top", 1) or "we,lx,xa").split(","))2758 slim_labels = set((tkrzw_dict.GetCommandFlag(args, "--slim", 1) or "ox,we,wj").split(","))2759 surfeit_labels = 
set((tkrzw_dict.GetCommandFlag(args, "--surfeit", 1) or "we").split(","))
  tran_list_labels = set((tkrzw_dict.GetCommandFlag(
    args, "--tran_list", 1) or "xa,wn,we").split(","))
  supplement_labels = set((tkrzw_dict.GetCommandFlag(args, "--supplement", 1) or "xs").split(","))
  phrase_prob_path = tkrzw_dict.GetCommandFlag(args, "--phrase_prob", 1) or ""
  tran_prob_path = tkrzw_dict.GetCommandFlag(args, "--tran_prob", 1) or ""
  tran_aux_paths = (tkrzw_dict.GetCommandFlag(args, "--tran_aux", 1) or "").split(",")
  tran_aux_last_paths = (tkrzw_dict.GetCommandFlag(args, "--tran_aux_last", 1) or "").split(",")
  rev_prob_path = tkrzw_dict.GetCommandFlag(args, "--rev_prob", 1) or ""
  cooc_prob_path = tkrzw_dict.GetCommandFlag(args, "--cooc_prob", 1) or ""
  aoa_paths = (tkrzw_dict.GetCommandFlag(args, "--aoa", 1) or "").split(",")
  keyword_path = tkrzw_dict.GetCommandFlag(args, "--keyword", 1) or ""
  min_prob_exprs = tkrzw_dict.GetCommandFlag(args, "--min_prob", 1) or ""
  min_prob_map = {}
  for min_prob_expr in min_prob_exprs.split(","):
    columns = min_prob_expr.split(":")
    if len(columns) == 2:
      min_prob_map[columns[0]] = float(columns[1])
  if tkrzw_dict.GetCommandFlag(args, "--quiet", 0):
    logger.setLevel(logging.ERROR)
  unused_flag = tkrzw_dict.GetUnusedFlag(args)
  if unused_flag:
    raise RuntimeError("Unknown flag: " + unused_flag)
  inputs = tkrzw_dict.GetArguments(args)
  if not inputs:
    raise RuntimeError("inputs are required")
  input_confs = []
  for input in inputs:
    input_conf = input.split(":", 1)
    if len(input_conf) != 2:
      raise RuntimeError("invalid input: " + input)
    input_confs.append(input_conf)
  BuildUnionDBBatch(input_confs, output_path, core_labels, full_def_labels, gross_labels,
                    surfeit_labels, top_labels, slim_labels, tran_list_labels, supplement_labels,
                    phrase_prob_path, tran_prob_path, tran_aux_paths, tran_aux_last_paths,
                    rev_prob_path, cooc_prob_path, aoa_paths, keyword_path,
                    min_prob_map).Run()

if __name__=="__main__":
...
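The deduplication pass in SetTranslations above works through the candidates in descending score order and damps any candidate that is a prefix, superstring, substring, or near-duplicate (normalized edit distance below 0.3) of one already accepted. Below is a minimal self-contained sketch of that idea, with a plain-Python Levenshtein function standing in for tkrzw.Utility.EditDistanceLev; the function names are illustrative, not part of the original script.

def edit_distance(a, b):
    # Dynamic-programming Levenshtein distance between two strings.
    dp = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        prev, dp[0] = dp[0], i
        for j, cb in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1,          # deletion
                                     dp[j - 1] + 1,      # insertion
                                     prev + (ca != cb))  # substitution
    return dp[-1]

def dedup_candidates(scored):
    # scored: (text, score) pairs, highest score first. Earlier (stronger)
    # entries set the baseline that later near-duplicates are biased against.
    kept = []
    for text, score in scored:
        bias = 1.0
        for prev_text, _ in kept:
            if len(prev_text) >= 2 and text.startswith(prev_text):
                bias = min(bias, 0.4)
            elif len(text) >= 2 and prev_text.startswith(text):
                bias = min(bias, 0.6)
            dist = edit_distance(text, prev_text)
            dist /= max(len(text), len(prev_text)) or 1
            if dist < 0.3:
                bias = min(bias, dist + 0.2)
        kept.append((text, score * bias))
    return sorted(kept, key=lambda x: x[1], reverse=True)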
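SetRelations collects idioms by jumping a key-ordered iterator to the prefix "word + ' '" in live_words, and again in rev_live_words, whose keys store the tokens in reversed order so that phrases ending with the headword can be found by the same prefix jump. A sketch of that scan over a sorted in-memory list, standing in for the tkrzw iterator (collect_idioms and min_ratio are illustrative names):

import bisect

def collect_idioms(sorted_items, word, word_prob, min_ratio=0.001):
    # sorted_items: (phrase, prob) pairs sorted by phrase, mimicking a
    # key-ordered database; word_prob: probability of the headword itself.
    prefix = word + " "
    idioms = []
    start = bisect.bisect_left(sorted_items, (prefix,))
    for phrase, prob in sorted_items[start:]:
        if not phrase.startswith(prefix):
            break  # keys are ordered, so the prefix range has ended
        score = prob / word_prob
        if score >= min_ratio:
            idioms.append((phrase, score))
    return sorted(idioms, key=lambda x: x[1], reverse=True)

In the original, candidates that contain a particle, or that are registered as verb, adjective, or adverb phrases, additionally get their score tripled before the final ranking.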
shorte_engine.py
Source:shorte_engine.py
...
def add_wikiword(self, word):
    # Register the record under its wikiword string, the same key that
    # is_wiki_word() uses for lookups.
    if word.wikiword in self.m_wiki_links:
        FATAL("Wikiword %s already exists" % word.wikiword)
    self.m_wiki_links[word.wikiword] = word

def is_wiki_word(self, phrase):
    '''Returns the target link if the phrase is a wikiword
       or None if it does not exist'''
    link = None
    if phrase in self.m_wiki_links:
        link = self.m_wiki_links[phrase]
    return link

def inkscape_to_png(self, name):
    '''This method is called to convert an inkscape
       SVG to PNG format for embedding in a document'''
    input = os.path.abspath(name)
    parts = os.path.splitext(input)
    basename = parts[0]
    #print "input = %s" % input
    #print "basename = %s" % basename
...
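A small usage sketch of the registry above, assuming the two methods live on the engine class; the WikiWord record type and MiniEngine harness here are hypothetical stand-ins for the real engine objects.

class WikiWord:
    # Hypothetical record; the real engine builds richer objects during parsing.
    def __init__(self, wikiword, link):
        self.wikiword = wikiword
        self.link = link

class MiniEngine:
    # Minimal stand-in exposing the same two methods as the snippet above.
    def __init__(self):
        self.m_wiki_links = {}

    def add_wikiword(self, word):
        if word.wikiword in self.m_wiki_links:
            raise ValueError("Wikiword %s already exists" % word.wikiword)
        self.m_wiki_links[word.wikiword] = word

    def is_wiki_word(self, phrase):
        return self.m_wiki_links.get(phrase)

engine = MiniEngine()
engine.add_wikiword(WikiWord("UserGuide", "user_guide.html"))
assert engine.is_wiki_word("UserGuide") is not None   # registered wikiword
assert engine.is_wiki_word("plain text") is None      # unknown phrase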
template_sql.py
Source:template_sql.py
...
            if(tmp == word):
                exclude_word = True
                break
        if(not exclude_word):
            link = self.m_engine.is_wiki_word(word)
            if(link != None):
                tmp = "<a href='%s#%s'>%s</a>" % (self.get_output_path(link), word, word)
                #print "WIKIWORD: %s" % tmp
                output += tmp
            else:
                #print "HERE I AM: %s" % word
                output += word
    return output

def format_text(self, data, allow_wikify=True, exclude_wikify=[], expand_equals_block=False):
    if(data == None):
        return
    if(len(data) != 0):
        data = re.sub("'", "'", data)
...
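The branch above is the heart of the wikify pass: each candidate word is looked up in the engine's wiki-link registry and, on a hit, emitted as an HTML anchor pointing at the generated document for that link. A condensed sketch of the loop, under the assumption that lookup and output_path behave like self.m_engine.is_wiki_word and self.get_output_path (the names here are illustrative):

def wikify(words, lookup, output_path):
    # words: tokens to scan; lookup: returns a link record or None;
    # output_path: maps a link record to its generated document path.
    output = ""
    for word in words:
        link = lookup(word)
        if link is not None:
            output += "<a href='%s#%s'>%s</a>" % (output_path(link), word, word)
        else:
            output += word
    return output

Words listed in exclude_wikify are filtered out before this lookup, which is what the exclude_word flag in the snippet implements.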