Best Python code snippets using nose2
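nose2 itself never appears in the snippets on this page; they are ordinary test and build modules that happen to be indexed under it. For reference, here is a minimal sketch of a test module and the nose2 entry point that would run it (module and test names are made up):

# test_example.py - a minimal module that nose2's default discovery can pick up
import unittest

class TestExample(unittest.TestCase):
    def test_addition(self):
        self.assertEqual(1 + 1, 2)

if __name__ == '__main__':
    import nose2
    nose2.main()  # same effect as running `python -m nose2` in this directory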
morfeusz_builder
Source: morfeusz_builder
#!/usr/bin/python
# -*- coding:utf-8 -*-
'''
Created on 21 paź 2013
@author: mlenart
'''
import os
import sys
import logging
import codecs
from morfeuszbuilder.fsa import encode
from morfeuszbuilder.fsa import convertinput
from morfeuszbuilder.fsa.fsa import FSA
from morfeuszbuilder.fsa.serializer import Serializer, SerializationMethod
from morfeuszbuilder.tagset.tagset import Tagset
from morfeuszbuilder.segrules import rulesParser
from morfeuszbuilder.utils import exceptions, limits
from optparse import OptionParser

def _checkOption(opt, parser, msg):
    if opt is None:
        print >> sys.stderr, msg
        parser.print_help()
        exit(1)

def _checkCondition(cond, parser, msg):
    if not cond:
        print >> sys.stderr, msg
        parser.print_help()
        exit(1)

def _parseListCallback(option, opt, value, parser):
    setattr(parser.values, option.dest, value.split(','))

def _checkOpen(filename, mode):
    try:
        with open(filename, mode) as _:
            pass
        if 'w' in mode:
            os.remove(filename)
    except IOError as ex:
        print >> sys.stderr, str(ex)
        exit(1)

def _getDictFilename(opts, isGenerator):
    typeCode = 's' if isGenerator else 'a'
    fname = '%s-%s.dict' % (opts.dictName, typeCode)
    return os.path.join(opts.dictDir, fname)

def _parseOptions():
    """
    Parses commandline args
    """
    parser = OptionParser()
    parser.add_option('--input-files',
                      type='string',
                      dest='inputFiles',
                      action='callback',
                      callback=_parseListCallback,
                      metavar='FILES',
                      help='comma separated list of dictionary files')
    parser.add_option('--tagset-file',
                      dest='tagsetFile',
                      metavar='FILE',
                      help='path to the file with tagset')
    parser.add_option('--segments-file',
                      dest='segmentsFile',
                      metavar='FILE',
                      help='path to the file with segment rules')
    #~ parser.add_option('--trim-supneg',
    #~                   dest='trimSupneg',
    #~                   default=False,
    #~                   action='store_true',
    #~                   help='this option is ignored and exists only for backwards compatibility')
    parser.add_option('--dict',
                      dest='dictName',
                      help='the name of result dictionary')
    parser.add_option('--dict-dir',
                      dest='dictDir',
                      metavar='FILE',
                      default=os.getcwd(),
                      help='path to output directory (the default is current dir)')
    parser.add_option('--only-analyzer',
                      dest='onlyAnalyzer',
                      action='store_true',
                      default=False,
                      help='Generate dictionary for morphological analysis only (default is both analysis and synthesis)')
    parser.add_option('--only-generator',
                      dest='onlyGenerator',
                      action='store_true',
                      default=False,
                      help='Generate dictionary for morphological synthesis only (default is both analysis and synthesis)')
    parser.add_option('--analyzer-cpp',
                      dest='analyzerCpp',
                      metavar='FILE',
                      help='Encode analyzer dictionary data in given c++ file')
    parser.add_option('--generator-cpp',
                      dest='generatorCpp',
                      metavar='FILE',
                      help='Encode generator dictionary data in given c++ file')
    #~ parser.add_option('--use-arrays',
    #~                   dest='useArrays',
    #~                   action='store_true',
    #~                   default=False,
    #~                   help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)')
    parser.add_option('--serialization-method',
                      dest='serializationMethod',
                      default='V1',
                      help="FSA serialization method: "
                           "SIMPLE - fixed-length transitions, fastest and weakest compression "
                           "V1 - variable-length transitions, compressed labels - strongest compression "
                           "V2 - format similar to the default in Jan Daciuk's fsa package - variable-length transitions, non-compressed labels - good compression, good speed")
    #~ parser.add_option('--visualize',
    #~                   dest='visualize',
    #~                   action='store_true',
    #~                   default=False,
    #~                   help='visualize result')
    parser.add_option('--analyzer-train-file',
                      dest='analyzerTrainFile',
                      help='A text file used for analyzer training. Should contain words from some large corpus - one word in each line. Resulting analysis automaton should be faster with proper train file.')
    parser.add_option('--generator-train-file',
                      dest='generatorTrainFile',
                      help='A text file used for generator training. Should contain words from some large corpus - one word in each line. Resulting synthesis automaton should be faster with proper train file.')
    parser.add_option('--debug',
                      dest='debug',
                      action='store_true',
                      default=False,
                      help='output some debugging info')
    #~ parser.add_option('--profile',
    #~                   dest='profile',
    #~                   action='store_true',
    #~                   default=False,
    #~                   help='show profiling graph (requires pycallgraph and graphviz)')
    opts, args = parser.parse_args()
    _checkOption(opts.inputFiles, parser, "Input file is missing")
    _checkOption(opts.dictDir, parser, "Output dictionary dir is missing")
    _checkCondition((opts.onlyAnalyzer, opts.onlyGenerator) != (True, True),
                    parser, 'Cannot set both --only-analyzer and --only-generator')
    writeCpp = {opts.analyzerCpp, opts.generatorCpp} != {None}
    _checkCondition(opts.dictName or writeCpp, parser, "Dictionary name is missing")
    _checkCondition(opts.onlyGenerator or opts.analyzerCpp or not writeCpp, parser, "Analyzer .cpp output file path is missing")
    _checkCondition(opts.onlyAnalyzer or opts.generatorCpp or not writeCpp, parser, "Generator .cpp output file path is missing")
    #~ _checkCondition((opts.dictName, opts.outputCpp) != (None, None),
    #~                 parser, 'Must set at least one of: --dict-name, --output-cpp')
    #~ _checkOption(opts.outputFile, parser, "Output file is missing")
    _checkOption(opts.tagsetFile, parser, "Tagset file is missing")
    _checkOption(opts.segmentsFile, parser, "Segmentation file is missing")
    #~ _checkOption(opts.serializationMethod, parser, "Serialization method file is missing")
    #~ _checkExactlyOneOptionSet([opts.analyzer, opts.generator],
    #~                           parser, 'Must set exactly one FSA type: --analyzer or --generator')
    _checkOpen(opts.tagsetFile, 'r')
    _checkOpen(opts.segmentsFile, 'r')
    for filename in opts.inputFiles:
        _checkOpen(filename, 'r')
    if not opts.onlyGenerator:
        _checkOpen(_getDictFilename(opts, isGenerator=False), 'w')
    if not opts.onlyAnalyzer:
        _checkOpen(_getDictFilename(opts, isGenerator=True), 'w')
    if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]:
        print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')'
        parser.print_help()
        exit(1)
    return opts

def _concatFiles(inputFiles):
    for inputFile in inputFiles:
        if inputFile:
            with open(inputFile, 'r') as f:
                for line in f:
                    yield line

def _readDictIdAndCopyright(inputFiles):
    dictId = None
    copyright = None
    for inputFile in inputFiles:
        if inputFile:
            with codecs.open(inputFile, 'r', 'utf8') as f:
                inCopyright = False
                for linenum, line in enumerate(f, start=1):
                    if dictId is None and line.startswith(u'#!DICT-ID'):
                        dictIdTag, _, dictId = line.strip().partition(u' ')
                        exceptions.validate(
                            dictIdTag == u'#!DICT-ID',
                            u'Dictionary ID tag must be followed by a space character and dictionary ID string')
                        exceptions.validate(
                            len(line.split(u' ')) > 1,
                            u'%s:%d: Must provide DICT-ID' % (inputFile, linenum))
                        exceptions.validate(
                            len(line.split(u' ')) == 2,
                            u'%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
                    elif copyright is None and line.startswith(u'#<COPYRIGHT>'):
                        exceptions.validate(
                            line.strip() == u'#<COPYRIGHT>',
                            u'%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
                        inCopyright = True
                        copyright = u''
                    elif line.startswith(u'#</COPYRIGHT>'):
                        exceptions.validate(
                            inCopyright,
                            u'%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
                        exceptions.validate(
                            line.strip() == u'#</COPYRIGHT>',
                            u'%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
                        inCopyright = False
                    elif inCopyright:
                        copyright += line
    if dictId is None:
        logging.warning(u'No dictionary ID tag found')
        dictId = u''
    if copyright is None:
        logging.warning(u'No copyright info found')
        copyright = u''
    return (dictId, copyright)

def _readNamesAndQualifiers(inputFiles):
    names = set([u''])
    qualifiers = set([frozenset()])
    lineParser = convertinput.LineParser()
    for line in _concatFiles(inputFiles):
        line = line.strip()
        if hasattr(line, 'decode'):
            # Py2.7
            line = line.decode('utf8')
        if not lineParser.ignoreLine(line):
            _, _, _, name, qualifier = lineParser.parseLine(line)
            names.add(name)
            qualifiers.add(convertinput.parseQualifiers(qualifier))
    namesMap = dict([(name, idx) for idx, name in enumerate(sorted(list(names)))])
    qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))])
    exceptions.validate(
        len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS,
        u'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
    return namesMap, qualifiersMap

def _readPolimorfInput4Analyzer(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
    logging.info('reading analyzer data from %s', str(inputFiles))
    for entry in convertinput.PolimorfConverter4Analyzer(tagset, namesMap, qualifiersMap, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
        yield entry

def _readPolimorfInput4Generator(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
    logging.info('reading generator data from %s', str(inputFiles))
    for entry in convertinput.PolimorfConverter4Generator(tagset, namesMap, qualifiersMap, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
        yield entry

def _readTrainData(trainFile):
    with codecs.open(trainFile, 'r', 'utf8') as f:
        for line in f:
            yield line.strip()

def _printStats(fsa):
    acceptingNum = 0
    sinkNum = 0
    arrayNum = 0
    for s in fsa.dfs():
        if s.isAccepting():
            acceptingNum += 1
        if s.transitionsNum == 0:
            sinkNum += 1
        if s.serializeAsArray:
            arrayNum += 1
    logging.info('states num: '+str(fsa.getStatesNum()))
    logging.info('transitions num: '+str(fsa.getTransitionsNum()))
    logging.info('accepting states num: '+str(acceptingNum))
    logging.info('sink states num: '+str(sinkNum))
    logging.info('array states num: '+str(arrayNum))

def buildAnalyzerFromPoliMorf(inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager):
    encoder = encode.MorphEncoder()
    fsa = FSA(encoder, tagset)
    for word, data in _readPolimorfInput4Analyzer(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
        # print word, data
        fsa.addEntry(word, data)
        del word
        del data
    fsa.close()
    logging.info('------')
    logging.info('Analyzer FSA stats:')
    logging.info('------')
    _printStats(fsa)
    return fsa

def buildGeneratorFromPoliMorf(inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager):
    encoder = encode.Encoder4Generator()
    fsa = FSA(encoder, tagset)
    inputData = _readPolimorfInput4Generator(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager)
    for word, data in inputData:
        fsa.addEntry(word, data)
    fsa.close()
    logging.info('------')
    logging.info('Generator FSA stats:')
    logging.info('------')
    _printStats(fsa)
    return fsa

def _doBuildDictionaryPart(opts, dictId, copyrightTxt, tagset, namesMap, qualifiersMap, isGenerator):
    logging.info('reading segmentation rules')
    rulesParserVersion = rulesParser.RulesParser.PARSE4ANALYZER if not isGenerator else rulesParser.RulesParser.PARSE4GENERATOR
    segmentRulesManager = rulesParser.RulesParser(tagset, namesMap, qualifiersMap, rulesParserVersion).parse(opts.segmentsFile)
    segmentationRulesData = segmentRulesManager.serialize()
    logging.info('done reading segmentation rules')
    logging.info('building automaton')
    buildFunction = buildAnalyzerFromPoliMorf if not isGenerator else buildGeneratorFromPoliMorf
    fsa = buildFunction(opts.inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager)
    logging.info('done building automaton')
    if not isGenerator and opts.analyzerTrainFile:
        logging.info('training analyzer automaton with '+opts.analyzerTrainFile+' ...')
        fsa.train(_readTrainData(opts.analyzerTrainFile))
        logging.info('done training')
    if isGenerator and opts.generatorTrainFile:
        logging.info('training generator automaton with '+opts.generatorTrainFile+' ...')
        fsa.train(_readTrainData(opts.generatorTrainFile))
        logging.info('done training')
    serializer = Serializer.getSerializer(opts.serializationMethod, fsa, dictId, copyrightTxt, tagset, namesMap, qualifiersMap, segmentationRulesData)
    if opts.generatorCpp and isGenerator:
        serializer.serialize2CppFile(opts.generatorCpp, isGenerator=isGenerator)
    if opts.analyzerCpp and not isGenerator:
        serializer.serialize2CppFile(opts.analyzerCpp, isGenerator=isGenerator)
    if opts.dictDir:
        serializer.serialize2BinaryFile(_getDictFilename(opts, isGenerator=isGenerator), isGenerator=isGenerator)
    logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))

def main(opts):
    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    logging.info('reading tagset')
    tagset = Tagset(opts.tagsetFile)
    logging.info('done reading tagset')
    logging.info('reading names and qualifiers')
    dictId, copyrightTxt = _readDictIdAndCopyright(opts.inputFiles)
    namesMap, qualifiersMap = _readNamesAndQualifiers(opts.inputFiles)
    logging.info('done reading names and qualifiers')
    if not opts.onlyGenerator:
        _doBuildDictionaryPart(opts, dictId, copyrightTxt, tagset, namesMap, qualifiersMap, isGenerator=False)
    if not opts.onlyAnalyzer:
        _doBuildDictionaryPart(opts, dictId, copyrightTxt, tagset, namesMap, qualifiersMap, isGenerator=True)

if __name__ == '__main__':
    import os
    opts = _parseOptions()
    #~ try:
    main(opts)
    #~ except Exception as ex:
    #~     print >> sys.stderr, u'Building dictionary file failed:', unicode(ex).encode('utf8'), 'type of error:', type(ex)
    #~     sys.exit(1)
    #~ finally:
...
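For context, `_readDictIdAndCopyright` above expects an optional `#!DICT-ID <id>` header line and a `#<COPYRIGHT>`...`#</COPYRIGHT>` block at the top of each input dictionary. A minimal sketch of exercising it, assuming the script is importable as a module named `morfeusz_builder` (the file contents and the dictionary entry below are made up):

import tempfile

from morfeusz_builder import _readDictIdAndCopyright  # assumes the script is on the import path

# Build a tiny input file with the header format the parser looks for.
with tempfile.NamedTemporaryFile(mode='w', suffix='.tab', delete=False) as tmp:
    tmp.write('#!DICT-ID example-dict\n')
    tmp.write('#<COPYRIGHT>\n')
    tmp.write('Example copyright notice\n')
    tmp.write('#</COPYRIGHT>\n')
    tmp.write('kot\tkot\tsubst:sg:nom:m2\n')  # hypothetical dictionary entry

dictId, copyrightTxt = _readDictIdAndCopyright([tmp.name])
print(dictId)        # -> example-dict
print(copyrightTxt)  # -> Example copyright notice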
morfeusz_builder.py
Source: morfeusz_builder.py
#!/usr/bin/python3
# -*- coding:utf-8 -*-
'''
Created on 21 paź 2013
@author: mlenart
'''
import os
import sys
import logging
import codecs
from morfeuszbuilder.fsa import encode
from morfeuszbuilder.fsa import convertinput
from morfeuszbuilder.fsa.fsa import FSA
from morfeuszbuilder.fsa.serializer import Serializer, SerializationMethod
from morfeuszbuilder.tagset.tagset import Tagset
from morfeuszbuilder.segrules import rulesParser
from morfeuszbuilder.utils import exceptions, limits
from optparse import OptionParser

def _checkOption(opt, parser, msg):
    if opt is None:
        print(msg, file=sys.stderr)
        parser.print_help()
        exit(1)

def _checkCondition(cond, parser, msg):
    if not cond:
        print(msg, file=sys.stderr)
        parser.print_help()
        exit(1)

def _parseListCallback(option, opt, value, parser):
    setattr(parser.values, option.dest, value.split(','))

def _checkOpen(filename, mode):
    try:
        with open(filename, mode) as _:
            pass
        if 'w' in mode:
            os.remove(filename)
    except IOError as ex:
        print(str(ex), file=sys.stderr)
        exit(1)

def _getDictFilename(opts, isGenerator):
    typeCode = 's' if isGenerator else 'a'
    fname = '%s-%s.dict' % (opts.dictName, typeCode)
    return os.path.join(opts.dictDir, fname)

def _parseOptions():
    """
    Parses commandline args
    """
    parser = OptionParser()
    parser.add_option('--input-files',
                      type='string',
                      dest='inputFiles',
                      action='callback',
                      callback=_parseListCallback,
                      metavar='FILES',
                      help='comma separated list of dictionary files')
    parser.add_option('--tagset-file',
                      dest='tagsetFile',
                      metavar='FILE',
                      help='path to the file with tagset')
    parser.add_option('--segments-file',
                      dest='segmentsFile',
                      metavar='FILE',
                      help='path to the file with segment rules')
    #~ parser.add_option('--trim-supneg',
    #~                   dest='trimSupneg',
    #~                   default=False,
    #~                   action='store_true',
    #~                   help='this option is ignored and exists only for backwards compatibility')
    parser.add_option('--dict',
                      dest='dictName',
                      help='the name of result dictionary')
    parser.add_option('--dict-dir',
                      dest='dictDir',
                      metavar='FILE',
                      default=os.getcwd(),
                      help='path to output directory (the default is current dir)')
    parser.add_option('--only-analyzer',
                      dest='onlyAnalyzer',
                      action='store_true',
                      default=False,
                      help='Generate dictionary for morphological analysis only (default is both analysis and synthesis)')
    parser.add_option('--only-generator',
                      dest='onlyGenerator',
                      action='store_true',
                      default=False,
                      help='Generate dictionary for morphological synthesis only (default is both analysis and synthesis)')
    parser.add_option('--analyzer-cpp',
                      dest='analyzerCpp',
                      metavar='FILE',
                      help='Encode analyzer dictionary data in given c++ file')
    parser.add_option('--generator-cpp',
                      dest='generatorCpp',
                      metavar='FILE',
                      help='Encode generator dictionary data in given c++ file')
    #~ parser.add_option('--use-arrays',
    #~                   dest='useArrays',
    #~                   action='store_true',
    #~                   default=False,
    #~                   help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)')
    parser.add_option('--serialization-method',
                      dest='serializationMethod',
                      default='V1',
                      help="FSA serialization method: "
                           "SIMPLE - fixed-length transitions, fastest and weakest compression "
                           "V1 - variable-length transitions, compressed labels - strongest compression "
                           "V2 - format similar to the default in Jan Daciuk's fsa package - variable-length transitions, non-compressed labels - good compression, good speed")
    #~ parser.add_option('--visualize',
    #~                   dest='visualize',
    #~                   action='store_true',
    #~                   default=False,
    #~                   help='visualize result')
    parser.add_option('--analyzer-train-file',
                      dest='analyzerTrainFile',
                      help='A text file used for analyzer training. Should contain words from some large corpus - one word in each line. Resulting analysis automaton should be faster with proper train file.')
    parser.add_option('--generator-train-file',
                      dest='generatorTrainFile',
                      help='A text file used for generator training. Should contain words from some large corpus - one word in each line. Resulting synthesis automaton should be faster with proper train file.')
    parser.add_option('--debug',
                      dest='debug',
                      action='store_true',
                      default=False,
                      help='output some debugging info')
    #~ parser.add_option('--profile',
    #~                   dest='profile',
    #~                   action='store_true',
    #~                   default=False,
    #~                   help='show profiling graph (requires pycallgraph and graphviz)')

    opts, args = parser.parse_args()

    _checkOption(opts.inputFiles, parser, "Input file is missing")
    _checkOption(opts.dictDir, parser, "Output dictionary dir is missing")
    _checkCondition((opts.onlyAnalyzer, opts.onlyGenerator) != (True, True),
                    parser, 'Cannot set both --only-analyzer and --only-generator')
    writeCpp = {opts.analyzerCpp, opts.generatorCpp} != {None}
    _checkCondition(opts.dictName or writeCpp, parser, "Dictionary name is missing")
    _checkCondition(opts.onlyGenerator or opts.analyzerCpp or not writeCpp, parser, "Analyzer .cpp output file path is missing")
    _checkCondition(opts.onlyAnalyzer or opts.generatorCpp or not writeCpp, parser, "Generator .cpp output file path is missing")
    #~ _checkCondition((opts.dictName, opts.outputCpp) != (None, None),
    #~                 parser, 'Must set at least one of: --dict-name, --output-cpp')
    #~ _checkOption(opts.outputFile, parser, "Output file is missing")
    _checkOption(opts.tagsetFile, parser, "Tagset file is missing")
    _checkOption(opts.segmentsFile, parser, "Segmentation file is missing")
    #~ _checkOption(opts.serializationMethod, parser, "Serialization method file is missing")
    #~ _checkExactlyOneOptionSet([opts.analyzer, opts.generator],
    #~                           parser, 'Must set exactly one FSA type: --analyzer or --generator')

    _checkOpen(opts.tagsetFile, 'r')
    _checkOpen(opts.segmentsFile, 'r')
    for filename in opts.inputFiles:
        _checkOpen(filename, 'r')
    if not opts.onlyGenerator:
        _checkOpen(_getDictFilename(opts, isGenerator=False), 'w')
    if not opts.onlyAnalyzer:
        _checkOpen(_getDictFilename(opts, isGenerator=True), 'w')

    if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]:
        print('--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')', file=sys.stderr)
        parser.print_help()
        exit(1)

    return opts

def _concatFiles(inputFiles):
    for inputFile in inputFiles:
        if inputFile:
            with open(inputFile, 'r') as f:
                for line in f:
                    yield line

def _readDictIdAndCopyright(inputFiles):
    dictId = None
    copyright = None
    for inputFile in inputFiles:
        if inputFile:
            with codecs.open(inputFile, 'r', 'utf8') as f:
                inCopyright = False
                for linenum, line in enumerate(f, start=1):
                    if dictId is None and line.startswith(u'#!DICT-ID'):
                        dictIdTag, _, dictId = line.strip().partition(u' ')
                        exceptions.validate(
                            dictIdTag == u'#!DICT-ID',
                            'Dictionary ID tag must be followed by a space character and dictionary ID string')
                        exceptions.validate(
                            len(line.split(u' ')) > 1,
                            '%s:%d: Must provide DICT-ID' % (inputFile, linenum))
                        exceptions.validate(
                            len(line.split(u' ')) == 2,
                            '%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
                    elif copyright is None and line.startswith(u'#<COPYRIGHT>'):
                        exceptions.validate(
                            line.strip() == u'#<COPYRIGHT>',
                            '%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
                        inCopyright = True
                        copyright = ''
                    elif line.startswith('#</COPYRIGHT>'):
                        exceptions.validate(
                            inCopyright,
                            '%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
                        exceptions.validate(
                            line.strip() == u'#</COPYRIGHT>',
                            '%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
                        inCopyright = False
                    elif inCopyright:
                        copyright += line
    if dictId is None:
        logging.warning('No dictionary ID tag found')
        dictId = ''
    if copyright is None:
        logging.warning('No copyright info found')
        copyright = ''
    return (dictId, copyright)

def _readNamesAndQualifiers(inputFiles):
    names = set([''])
    qualifiers = set([frozenset()])
    lineParser = convertinput.LineParser()
    for line in _concatFiles(inputFiles):
        line = line.strip()
        if not lineParser.ignoreLine(line):
            _, _, _, name, qualifier = lineParser.parseLine(line)
            names.add(name)
            qualifiers.add(convertinput.parseQualifiers(qualifier))
    namesMap = dict([(name, idx) for idx, name in enumerate(sorted(list(names)))])
    qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))])
    exceptions.validate(
        len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS,
        'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)

    return namesMap, qualifiersMap

def _readPolimorfInput4Analyzer(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
    logging.info('reading analyzer data from %s', str(inputFiles))
    for entry in convertinput.PolimorfConverter4Analyzer(tagset, namesMap, qualifiersMap, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
        yield entry

def _readPolimorfInput4Generator(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
    logging.info('reading generator data from %s', str(inputFiles))
    for entry in convertinput.PolimorfConverter4Generator(tagset, namesMap, qualifiersMap, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
        yield entry

def _readTrainData(trainFile):
    with codecs.open(trainFile, 'r', 'utf8') as f:
        for line in f:
            yield line.strip()

def _printStats(fsa):
    acceptingNum = 0
    sinkNum = 0
    arrayNum = 0
    for s in fsa.dfs():
        if s.isAccepting():
            acceptingNum += 1
        if s.transitionsNum == 0:
            sinkNum += 1
        if s.serializeAsArray:
            arrayNum += 1
    logging.info('states num: '+str(fsa.getStatesNum()))
    logging.info('transitions num: '+str(fsa.getTransitionsNum()))
    logging.info('accepting states num: '+str(acceptingNum))
    logging.info('sink states num: '+str(sinkNum))
    logging.info('array states num: '+str(arrayNum))

def buildAnalyzerFromPoliMorf(inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager):
    encoder = encode.MorphEncoder()
    fsa = FSA(encoder, tagset)
    for word, data in _readPolimorfInput4Analyzer(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
        # print word, data
        fsa.addEntry(word, data)
        del word
        del data
    fsa.close()
    logging.info('------')
    logging.info('Analyzer FSA stats:')
    logging.info('------')
    _printStats(fsa)
    return fsa

def buildGeneratorFromPoliMorf(inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager):
    encoder = encode.Encoder4Generator()
    fsa = FSA(encoder, tagset)
    inputData = _readPolimorfInput4Generator(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager)
    for word, data in inputData:
        fsa.addEntry(word, data)
    fsa.close()
    logging.info('------')
    logging.info('Generator FSA stats:')
    logging.info('------')
    _printStats(fsa)
    return fsa

def _doBuildDictionaryPart(opts, dictId, copyrightTxt, tagset, namesMap, qualifiersMap, isGenerator):

    logging.info('reading segmentation rules')
    rulesParserVersion = rulesParser.RulesParser.PARSE4ANALYZER if not isGenerator else rulesParser.RulesParser.PARSE4GENERATOR
    segmentRulesManager = rulesParser.RulesParser(tagset, namesMap, qualifiersMap, rulesParserVersion).parse(opts.segmentsFile)
    segmentationRulesData = segmentRulesManager.serialize()
    logging.info('done reading segmentation rules')

    logging.info('building automaton')
    buildFunction = buildAnalyzerFromPoliMorf if not isGenerator else buildGeneratorFromPoliMorf
    fsa = buildFunction(opts.inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager)
    logging.info('done building automaton')

    if not isGenerator and opts.analyzerTrainFile:
        logging.info('training analyzer automaton with '+opts.analyzerTrainFile+' ...')
        fsa.train(_readTrainData(opts.analyzerTrainFile))
        logging.info('done training')

    if isGenerator and opts.generatorTrainFile:
        logging.info('training generator automaton with '+opts.generatorTrainFile+' ...')
        fsa.train(_readTrainData(opts.generatorTrainFile))
        logging.info('done training')

    serializer = Serializer.getSerializer(opts.serializationMethod, fsa, dictId, copyrightTxt, tagset, namesMap, qualifiersMap, segmentationRulesData)
    if opts.generatorCpp and isGenerator:
        serializer.serialize2CppFile(opts.generatorCpp, isGenerator=isGenerator)
    if opts.analyzerCpp and not isGenerator:
        serializer.serialize2CppFile(opts.analyzerCpp, isGenerator=isGenerator)

    if opts.dictDir:
        serializer.serialize2BinaryFile(_getDictFilename(opts, isGenerator=isGenerator), isGenerator=isGenerator)

    logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))

def main(opts):
    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    logging.info('reading tagset')
    tagset = Tagset(opts.tagsetFile)
    logging.info('done reading tagset')

    logging.info('reading names and qualifiers')
    dictId, copyrightTxt = _readDictIdAndCopyright(opts.inputFiles)
    namesMap, qualifiersMap = _readNamesAndQualifiers(opts.inputFiles)
    logging.info('done reading names and qualifiers')

    if not opts.onlyGenerator:
        _doBuildDictionaryPart(opts, dictId, copyrightTxt, tagset, namesMap, qualifiersMap, isGenerator=False)
    if not opts.onlyAnalyzer:
        _doBuildDictionaryPart(opts, dictId, copyrightTxt, tagset, namesMap, qualifiersMap, isGenerator=True)

if __name__ == '__main__':
    import os
    opts = _parseOptions()
...
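Taken together, the options above imply an invocation along the following lines; every path and the dictionary name here are hypothetical:

import subprocess
import sys

# Hypothetical invocation of the builder script (all file names are made up):
subprocess.run([
    sys.executable, 'morfeusz_builder.py',
    '--input-files=polimorf.tab,extras.tab',  # comma-separated, split by _parseListCallback
    '--tagset-file=polimorf.tagset',
    '--segments-file=segmenty.dat',
    '--dict=sgjp',
    '--dict-dir=/tmp/out',
], check=True)

# Expected outputs, following _getDictFilename's '%s-%s.dict' naming scheme:
#   /tmp/out/sgjp-a.dict  (analyzer)
#   /tmp/out/sgjp-s.dict  (generator)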
test_account.py
Source: test_account.py
...
    _ = logged_in_account.username
    assert logged_in_account.n_requests == n + 1

def test_iter_lists(once_logged_in_account):
    lists = once_logged_in_account.iter_lists()
    assert inspect.isgenerator(lists)
    assert all(isinstance(x, TMDbList) for x in lists)

def test_iter_favorite_movies(once_logged_in_account):
    favorite_movies = once_logged_in_account.iter_favorite_movies()
    assert inspect.isgenerator(favorite_movies)
    assert all(isinstance(x, Movie) for x in favorite_movies)

def test_iter_favorite_shows(once_logged_in_account):
    favorite_shows = once_logged_in_account.iter_favorite_shows()
    assert inspect.isgenerator(favorite_shows)
    assert all(isinstance(x, Show) for x in favorite_shows)

def test_iter_rated_movies(once_logged_in_account):
    rated_movies = once_logged_in_account.iter_rated_movies()
    assert inspect.isgenerator(rated_movies)
    assert all(isinstance(x, Movie) for x in rated_movies)

def test_iter_rated_shows(once_logged_in_account):
    rated_shows = once_logged_in_account.iter_rated_shows()
    assert inspect.isgenerator(rated_shows)
    assert all(isinstance(x, Show) for x in rated_shows)

def test_iter_rated_episodes(once_logged_in_account):
    rated_episodes = once_logged_in_account.iter_rated_episodes()
    assert inspect.isgenerator(rated_episodes)
    assert all(isinstance(x, Episode) for x in rated_episodes)

def test_iter_movie_watchlist(once_logged_in_account):
    movie_watchlist = once_logged_in_account.iter_movie_watchlist()
    assert inspect.isgenerator(movie_watchlist)
    assert all(isinstance(x, Movie) for x in movie_watchlist)

def test_iter_show_watchlist(once_logged_in_account):
    show_watchlist = once_logged_in_account.iter_show_watchlist()
    assert inspect.isgenerator(show_watchlist)
    assert all(isinstance(x, Show) for x in show_watchlist)

def test_mark_as_favorite(once_logged_in_account):
    once_logged_in_account.remove_from_favorites(Movie(18148))
    r = once_logged_in_account.mark_as_favorite(Movie(18148))
    assert r["status_code"] == 1

def test_remove_from_favorite(once_logged_in_account):
    once_logged_in_account.mark_as_favorite(Movie(18148))
    r = once_logged_in_account.remove_from_favorites(Movie(18148))
    assert r["status_code"] == 13

def test_add_to_watchlist(once_logged_in_account):
    once_logged_in_account.remove_from_watchlist(Movie(18148))
    r = once_logged_in_account.add_to_watchlist(Movie(18148))
    assert r["status_code"] == 1

def test_remove_from_watchlist(once_logged_in_account):
...
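These tests depend on `logged_in_account` and `once_logged_in_account` fixtures that the excerpt does not show. A minimal sketch of what a matching conftest.py might look like; the `Account` class, its import path, and the login API are assumptions, not taken from the source:

# conftest.py - hypothetical fixtures for the excerpt above
import os

import pytest
from themoviedb import Account  # assumed import path

@pytest.fixture
def logged_in_account():
    # A fresh login for tests that count requests per session.
    account = Account()
    account.log_in(os.environ["TMDB_USERNAME"], os.environ["TMDB_PASSWORD"])
    return account

@pytest.fixture(scope="session")
def once_logged_in_account():
    # Logged in once and reused across the whole session, which is what the
    # "once_" prefix suggests; avoids re-authenticating for every test.
    account = Account()
    account.log_in(os.environ["TMDB_USERNAME"], os.environ["TMDB_PASSWORD"])
    return account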
v1.py
Source: v1.py
import json
from pathlib import Path
from typing import Dict, List, Optional

from cmake_file_api.kinds.common import CMakeSourceBuildPaths, VersionMajorMinor
from cmake_file_api.kinds.kind import ObjectKind

class CMakeFilesInput(object):
    __slots__ = ("path", "isGenerator", "isExternal", "isCMake")

    def __init__(self, path: Path, isGenerator: Optional[bool], isExternal: Optional[bool], isCMake: Optional[bool]):
        self.path = path
        self.isGenerator = isGenerator
        self.isExternal = isExternal
        self.isCMake = isCMake

    @classmethod
    def from_dict(cls, dikt: Dict) -> "CMakeFilesInput":
        path = Path(dikt["path"])
        isGenerator = dikt.get("isGenerator")
        isExternal = dikt.get("isExternal")
        isCMake = dikt.get("isCMake")
        return cls(path, isGenerator, isExternal, isCMake)

    def __repr__(self) -> str:
        return "{}(path='{}', generator={}, external={}, cmake={})".format(
            type(self).__name__,
            self.path,
            self.isGenerator,
            self.isExternal,
            self.isCMake,
        )

class CMakeFilesV1(object):
    KIND = ObjectKind.CMAKEFILES

    __slots__ = ("version", "paths", "inputs")

    def __init__(self, version: VersionMajorMinor, paths: CMakeSourceBuildPaths, inputs: List[CMakeFilesInput]):
        self.version = version
        self.paths = paths
        self.inputs = inputs

    @classmethod
    def from_dict(cls, dikt: Dict, reply_path: Path) -> "CMakeFilesV1":
        version = VersionMajorMinor.from_dict(dikt["version"])
        paths = CMakeSourceBuildPaths.from_dict(dikt["paths"])
        inputs = list(CMakeFilesInput.from_dict(cmi) for cmi in dikt["inputs"])
        return cls(version, paths, inputs)

    @classmethod
    def from_path(cls, path: Path, reply_path: Path) -> "CMakeFilesV1":
        dikt = json.load(path.open())
        return cls.from_dict(dikt, reply_path)

    def __repr__(self) -> str:
        return "{}(version={}, paths={}, inputs={})".format(
            type(self).__name__,
            repr(self.version),
            self.paths,
            repr(self.inputs),
...
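A minimal usage sketch for the classes above, assuming a CMake file-API reply has already been generated under the conventional `.cmake/api/v1/reply` directory; the build path and glob pattern here are assumptions:

from pathlib import Path

# Hypothetical reply directory produced by running CMake with a file-api
# query; the reply file name (cmakeFiles-v1-<hash>.json) varies per build.
reply_dir = Path("build/.cmake/api/v1/reply")
reply_file = next(reply_dir.glob("cmakeFiles-v1-*.json"))

# CMakeFilesV1 is the class defined above; assumes it is importable here.
cmake_files = CMakeFilesV1.from_path(reply_file, reply_dir)
for inp in cmake_files.inputs:
    # isGenerator marks files generated by CMake, isExternal marks files
    # outside the source tree, isCMake marks files shipped with CMake itself.
    print(inp.path, inp.isGenerator, inp.isExternal, inp.isCMake)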