Best Python code snippet using localstack_python
queries.py
Source:queries.py
1import pandas as pd2from pymongo import MongoClient3from backend.DB import eu, db4import backend.precomputing as precomputing5#from DB import eu, db6#import precomputing7########################################################################################################################8countries = ['NO', 'HR', 'HU', 'CH', 'CZ', 'RO', 'LV', 'GR', 'UK', 'SI', 'LT',9 'ES', 'FR', 'IE', 'SE', 'NL', 'PT', 'PL', 'DK', 'MK', 'DE', 'IT',10 'BG', 'CY', 'AT', 'LU', 'BE', 'FI', 'EE', 'SK', 'MT', 'LI', 'IS']11def year_country_filter(bot_year, top_year, country_list):12 filter_ = {13 '$match': {14 '$and': [{"_id.year": {'$gte': bot_year}}, {"_id.year": {'$lte': top_year}}],15 "_id.country": { '$in': country_list }16 }17 }18 return filter_19# the function below filters the procurement database20# by year (YEAR) and issuer countries (ISO_COUNTRY_CODE), matching the appropriate CPV_DIVISION.21def year_country_cpv_filter(bot_year, top_year, country_list,cpv=None):22 if cpv == None:23 filter_ = {24 '$match': {25 '$and': [{"YEAR": {'$gte': bot_year}}, {"YEAR": {'$lte': top_year}}],26 "ISO_COUNTRY_CODE": {'$in': country_list}}}27 else:28 filter_ = {29 '$match': {30 '$and': [{"YEAR": {'$gte': bot_year}}, {"YEAR": {'$lte': top_year}}],31 "ISO_COUNTRY_CODE": { '$in': country_list},32 "CPV_DIVISION":{'$eq':cpv}33 }34 }35 return filter_36# Apply the pipeline to retrieve 'cpv_division_description' from CPV collection,37# Save this value as a python dict 'description_dict' to be used later.38description_list = list(db.cpv.aggregate(precomputing.pipeline_cpv_div_desc))39description_dict = {}40for i in description_list:41 description_dict[i['_id']['cpv_division']] = i['_id']['cpv_division_description']42##################################################################################43#Queries for looking up the cpv description44lookup_cpv={"$lookup":{45 "from": "cpv_grouped",46 "localField": "_id",47 "foreignField": "_id.cpv_division",48 "as": "cpv_lookup"49 }}50unwind_cpv={"$unwind":"$cpv_lookup"}51###################################################################################52def ex0_cpv_example(bot_year=2008, top_year=2020):53 """54 Returns all contracts in given year 'YEAR' range and cap to 100000000 the 'VALUE_EURO'55 Expected Output (list of documents):56 [{'result': count_value(int)}]57 """58 def year_filter(bot_year, top_year):59 filter_ = {60 '$match': {61 '$and': [{'YEAR': {'$gte': bot_year}}, {'YEAR': {'$lte': top_year}}],62 'VALUE_EURO': {'$lt': 100000000}63 }}64 return filter_65 count = {66 '$count': 'result'67 }68 pipeline = [year_filter(bot_year, top_year), count]69 list_documents = list(eu.aggregate(pipeline))70 return list_documents71def ex1_cpv_box(bot_year=2008, top_year=2020, country_list=countries):72 """73 Returns five metrics, described below74 Result filterable by floor year, roof year and country_list75 Expected Output:76 (avg_cpv_euro_avg, avg_cpv_count, avg_cpv_offer_avg, avg_cpv_euro_avg_y_eu, avg_cpv_euro_avg_n_eu)77 Where:78 avg_cpv_euro_avg = average value of each CPV's division contracts average 'VALUE_EURO', (int)79 avg_cpv_count = average value of each CPV's division contract count, (int)80 avg_cpv_offer_avg = average value of each CPV's division contracts average NUMBER_OFFERS', (int)81 avg_cpv_euro_avg_y_eu = average value of each CPV's division contracts average VALUE_EURO' with 'B_EU_FUNDS', (int)82 avg_cpv_euro_avg_n_eu = average value of each CPV's division contracts average 'VALUE_EURO' with out 'B_EU_FUNDS' (int)83 """84 avg_cpv_euro_avg = None85 avg_cpv_count = None86 avg_cpv_offer_avg = None87 avg_cpv_euro_avg_y_eu = None88 avg_cpv_euro_avg_n_eu = None89 #### 1.190 query0 = {"$group": {"_id": "$_id.cpv_division", "sumOfSum": {"$sum": "$sum"}, "sumOfCounts": {"$sum": "$count"}}}91 query1 = {"$project": {"_id": "$_id", "avgValueEuro": {"$divide": ["$sumOfSum", "$sumOfCounts"]}}}92 query2 = {"$group": {"_id": {}, "avgValueEuro": {"$avg": "$avgValueEuro"}}}93 pipeline_1_1 = [year_country_filter(bot_year, top_year, country_list), query0,query1,query2]94 myCursor = db.cpv_euro_avg.aggregate(pipeline_1_1)95 for document in myCursor:96 avg_cpv_euro_avg = document['avgValueEuro']97 #### 1.298 query3 = {"$group": {"_id": "$_id.cpv_division",99 "count": {"$sum": "$count"}}}100 query4 = {"$group": {"_id": {}, "avgCount": {"$avg": "$count"}}}101 pipeline_1_2 = [year_country_filter(bot_year, top_year, country_list), query3, query4]102 myCursor = db.cpv_euro_avg.aggregate(pipeline_1_2)103 for document in myCursor:104 avg_cpv_count = document['avgCount']105 #### 1.3106 query5 = {"$group": {"_id": "$_id.cpv_division",107 "count": {"$sum": "$count"}, "sumOfOffers": {"$sum": "$countOffers"}}}108 query6 = {"$project": {"_id": "$_id", "avgNoOffers": {"$divide": ["$sumOfOffers", "$count"]}}}109 query7 = {"$group": {"_id": {}, "avgNoOffers": {"$avg": "$avgNoOffers"}}}110 pipeline_1_3 = [year_country_filter(bot_year, top_year, country_list), query5, query6,query7]111 myCursor = db.cpv_euro_avg.aggregate(pipeline_1_3)112 for document in myCursor:113 avg_cpv_offer_avg = document['avgNoOffers']114 #### 1.4115 query8 = {"$group": {"_id": "$_id.cpv_division", "sumOfSum": {"$sum": "$sumValueBEuFundsYes"},116 "sumOfCounts": {"$sum": "$countBEuFundsYes"}}}117 query9 = {"$project": {"_id": "$_id", "avgValueEuro_Funds": {"$divide": ["$sumOfSum", "$sumOfCounts"]}}}118 query10 = {"$group": {"_id": {}, "avgValueEuro_Funds": {"$avg": "$avgValueEuro_Funds"}}}119 pipeline_1_4 = [year_country_filter(bot_year, top_year, country_list), query8, query9,query10]120 myCursor = db.cpv_euro_avg.aggregate(pipeline_1_4)121 for document in myCursor:122 avg_cpv_euro_avg_y_eu = document['avgValueEuro_Funds']123 #### 1.5124 query11 = {"$group": {"_id": "$_id.cpv_division", "sumOfSum": {"$sum": "$sumValueBEuFundsNo"},125 "sumOfCounts": {"$sum": "$countBEuFundsNo"}}}126 query12 = {"$project": {"_id": "$_id", "avgValueEuro_Funds": {"$divide": ["$sumOfSum", "$sumOfCounts"]}}}127 query13 = {"$group": {"_id": {}, "avgValueEuro_Funds": {"$avg": "$avgValueEuro_Funds"}}}128 pipeline_1_5 = [year_country_filter(bot_year, top_year, country_list), query11, query12,query13]129 myCursor = db.cpv_euro_avg.aggregate(pipeline_1_5)130 for document in myCursor:131 avg_cpv_euro_avg_n_eu = document['avgValueEuro_Funds']132 return avg_cpv_euro_avg, avg_cpv_count, avg_cpv_offer_avg, avg_cpv_euro_avg_y_eu, avg_cpv_euro_avg_n_eu133def ex2_cpv_treemap(bot_year=2008, top_year=2020, country_list=countries):134 """135 Returns the count of contracts for each CPV Division136 Result filterable by floor year, roof year and country_list137 Expected Output (list of documents):138 [{cpv: value_1, count: value_2}, ....]139 Where:140 value_1 = CPV Division description, (string) (located in cpv collection as 'cpv_division_description')141 value_2 = contract count of each CPV Division, (int)142 """143 query1 = {"$group": {"_id": "$_id.cpv_division",144 "count": {"$sum": "$count"}}}145 pipeline_2 = [year_country_filter(bot_year, top_year, country_list), query1, lookup_cpv, unwind_cpv]146 # cursor from the pipeline above147 # it assumes the existence of a pre-calculated collection (defined in insert_operation)148 myCursor = db.cpv_euro_avg.aggregate(pipeline_2)149 list_documents = []150 for document in myCursor:151 list_documents.append({"cpv": document['cpv_lookup']['cpv_division_description'],152 "count": document['count']})153 return list_documents154def ex3_cpv_bar_1(bot_year=2008, top_year=2020, country_list=countries):155 """156 Per CPV Division and get the average 'VALUE_EURO' return the highest 5 cpvs157 Result filterable by floor year, roof year and country_list158 Expected Output (list of 5 sorted documents):159 [{cpv: value_1, avg: value_2}, ....]160 Where:161 value_1 = CPV Division description, (string) (located in cpv collection as 'cpv_division_description')162 value_2 = average 'VALUE_EURO' of each CPV Division, (float)163 """164 query0 = {"$group": {"_id": "$_id.cpv_division", "sumOfSum": {"$sum": "$sum"}, "sumOfCounts": {"$sum": "$count"}}}165 query1 = {"$project": {"_id": "$_id", "avgValueEuro": {"$divide": ["$sumOfSum", "$sumOfCounts"]}}}166 query2 = {"$sort": {"avgValueEuro": -1}}167 query3 = {"$limit": 5}168 pipeline_3 = [year_country_filter(bot_year, top_year, country_list),query0, query1, query2, query3, lookup_cpv, unwind_cpv]169 myCursor = db.cpv_euro_avg.aggregate(pipeline_3)170 list_documents = []171 for document in myCursor:172 list_documents.append({"cpv": document['cpv_lookup']['cpv_division_description'],173 "avg": document['avgValueEuro']})174 return list_documents175def ex4_cpv_bar_2(bot_year=2008, top_year=2020, country_list=countries):176 """177 Per CPV Division and get the average 'VALUE_EURO' return the lowest 5 cpvs178 Result filterable by floor year, roof year and country_list179 Expected Output (list of 5 sorted documents):180 [{cpv: value_1, avg: value_2}, ....]181 Where:182 value_1 = CPV Division description, (string) (located in cpv collection as 'cpv_division_description')183 value_2 = average 'VALUE_EURO' of each CPV Division, (float)184 """185 query0 = {"$group": {"_id": "$_id.cpv_division", "sumOfSum": {"$sum": "$sum"}, "sumOfCounts": {"$sum": "$count"}}}186 query1 = {"$project": {"_id": "$_id", "avgValueEuro": {"$divide": ["$sumOfSum", "$sumOfCounts"]}}}187 query2 = {"$sort": {"avgValueEuro": 1}}188 query3 = {"$limit": 5}189 pipeline_4 = [year_country_filter(bot_year, top_year, country_list), query0, query1, query2, query3, lookup_cpv,190 unwind_cpv]191 myCursor = db.cpv_euro_avg.aggregate(pipeline_4)192 list_documents = []193 for document in myCursor:194 list_documents.append({"cpv": document['cpv_lookup']['cpv_division_description'],195 "avg": document['avgValueEuro']})196 return list_documents197def ex5_cpv_bar_3(bot_year=2008, top_year=2020, country_list=countries):198 """199 Per CPV Division and get the average 'VALUE_EURO' return the highest 5 cpvs for contracts which recieved european funds ('B_EU_FUNDS')200 Result filterable by floor year, roof year and country_list201 Expected Output (list of 5 sorted documents):202 [{cpv: value_1, avg: value_2}, ....]203 Where:204 value_1 = CPV Division description, (string) (located in cpv collection as 'cpv_division_description')205 value_2 = average 'VALUE_EURO' of each CPV Division, (float)206 """207 query0 = {"$group": {"_id": "$_id.cpv_division", "sumOfSum": {"$sum": "$sumValueBEuFundsYes"},208 "sumOfCounts": {"$sum": "$countBEuFundsYes"}}}209 query1 = {"$project": {"_id": "$_id", "avgValueEuro_Funds": {"$divide": ["$sumOfSum", "$sumOfCounts"]}}}210 query2 = {"$sort": {"avgValueEuro_Funds": -1}}211 query3 = {"$limit": 5}212 pipeline_5 = [year_country_filter(bot_year, top_year, country_list), query0, query1, query2, query3, lookup_cpv,213 unwind_cpv]214 myCursor = db.cpv_euro_avg.aggregate(pipeline_5)215 list_documents = []216 for document in myCursor:217 list_documents.append({"cpv": document['cpv_lookup']['cpv_division_description'],218 "avg": document['avgValueEuro_Funds']})219 return list_documents220def ex6_cpv_bar_4(bot_year=2008, top_year=2020, country_list=countries):221 """222 Per CPV Division and get the average 'VALUE_EURO' return the highest 5 cpvs for contracts which did not recieve european funds ('B_EU_FUNDS')223 Result filterable by floor year, roof year and country_list224 Expected Output (list of 5 sorted documents):225 [{cpv: value_1, avg: value_2}, ....]226 Where:227 value_1 = CPV Division description, (string) (located in cpv collection as 'cpv_division_description')228 value_2 = average 'VALUE_EURO' of each CPV Division, (float)229 """230 query0 = {"$group": {"_id": "$_id.cpv_division", "sumOfSum": {"$sum": "$sumValueBEuFundsNo"},231 "sumOfCounts": {"$sum": "$countBEuFundsNo"}}}232 query1 = {"$project": {"_id": "$_id", "avgValueEuro_Funds": {"$divide": ["$sumOfSum", "$sumOfCounts"]}}}233 query2 = {"$sort": {"avgValueEuro_Funds": -1}}234 query3 = {"$limit": 5}235 pipeline_6 = [year_country_filter(bot_year, top_year, country_list), query0, query1, query2, query3, lookup_cpv,236 unwind_cpv]237 myCursor = db.cpv_euro_avg.aggregate(pipeline_6)238 list_documents = []239 for document in myCursor:240 list_documents.append({"cpv": document['cpv_lookup']['cpv_division_description'],241 "avg": document['avgValueEuro_Funds']})242 return list_documents243def ex7_cpv_map(bot_year=2008, top_year=2020, country_list=countries):244 """245 Returns the highest CPV Division on average 'VALUE_EURO' per country 'ISO_COUNTRY_CODE'246 Result filterable by floor year, roof year and country_list247 Expected Output (list of documents):248 [{cpv: value_1, avg: value_2, country: value_3}, ....]249 Where:250 value_1 = CPV Division description, (string) (located in cpv collection as 'cpv_division_description')251 value_2 = highest CPV Division average 'VALUE_EURO' of country, (float)252 value_3 = country in ISO-A2 format (string) (located in iso_codes collection)253 """254 query0 = {"$group": {"_id": {"cpv": "$_id.cpv_division", "country": "$_id.country"}, "sumOfSum": {"$sum": "$sum"},255 "sumOfCounts": {"$sum": "$count"}}}256 query1 = {"$project": {"_id": "$_id", "avgValueEuro": {"$divide": ["$sumOfSum", "$sumOfCounts"]}}}257 query2 = {"$sort": {"_id.country": 1, "avgValueEuro": -1}}258 query3 = {"$group": {"_id": "$_id.country", "maxAvgValueEuro": {"$first": "$avgValueEuro"},259 "cpv_division": {"$first": "$_id.cpv"}}}260 lookup_cpv_7 = {"$lookup": {261 "from": "cpv_grouped",262 "localField": "cpv_division",263 "foreignField": "_id.cpv_division",264 "as": "cpv_lookup"265 }}266 lookupCountry = { "$lookup": {267 "from": "iso_codes",268 "localField": "_id",269 "foreignField": "alpha-2",270 "as": "country_iso"271 }272 }273 pipeline_7 = [year_country_filter(bot_year, top_year, country_list), query0, query1, query2, query3, lookup_cpv_7,274 unwind_cpv, lookupCountry]275 myCursor = db.cpv_euro_avg.aggregate(pipeline_7)276 list_documents = []277 for document in myCursor:278 if document['_id'] != "UK":279 list_documents.append({"cpv": document['cpv_lookup']['cpv_division_description'],280 "avg": document['maxAvgValueEuro'],281 "country": document['country_iso'][0]['name']282 })283 else:284 list_documents.append({"cpv": document['cpv_lookup']['cpv_division_description'],285 "avg": document['maxAvgValueEuro'],286 "country":document["_id"]})287 return list_documents288def ex8_cpv_hist(bot_year=2008, top_year=2020, country_list=countries, cpv='50'):289 """290 Produce an histogram where each bucket has the contract counts of a particular cpv291 in a given range of values (bucket) according to 'VALUE_EURO'292 Choose 10 buckets of any partition293 Buckets Example:294 0 to 100000295 100000 to 200000296 200000 to 300000297 300000 to 400000298 400000 to 500000299 500000 to 600000300 600000 to 700000301 700000 to 800000302 800000 to 900000303 900000 to 1000000304 So given a CPV Division code (two digit string) return a list of documents where each document as the bucket _id,305 and respective bucket count.306 Result filterable by floor year, roof year and country_list307 Expected Output (list of documents):308 [{bucket: value_1, count: value_2}, ....]309 Where:310 value_1 = lower limit of respective bucket (if bucket position 0 of example then bucket:0 )311 value_2 = contract count for thar particular bucket, (int)312 """313 query1 = {314 "$bucket": {315 "groupBy": "$VALUE_EURO", # // Field to group by316 "boundaries": [0, 1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000,317 10000000], # // Boundaries318 "default": ">10Mil", # // Bucket id for documents which do not fall into a bucket319 "output": { # // Output for each bucket320 "count": {"$sum": 1}}}}321 pipeline_8 = [year_country_cpv_filter(bot_year, top_year, country_list,cpv),query1]322 myCursor = db.cpv_bucket.aggregate(pipeline_8)323 list_documents = []324 for document in myCursor:325 list_documents.append({"bucket": document['_id'],326 "count": document['count']})327 return list_documents328def ex9_cpv_bar_diff(bot_year=2008, top_year=2020, country_list=countries):329 """330 Returns the average time and value difference for each CPV, return the highest 5 cpvs331 time difference = 'DT-DISPATCH' - 'DT-AWARD'332 value difference = 'AWARD_VALUE_EURO' - 'VALUE_EURO'333 Result filterable by floor year, roof year and country_list334 Expected Output (list of documents):335 [{cpv: value_1, time_difference: value_2, value_difference: value_3}, ....]336 Where:337 value_1 = CPV Division description, (string) (located in cpv collection as 'cpv_division_description')338 value_2 = average 'DT-DISPACH' - 'DT-AWARD', (float)339 value_3 = average 'EURO_AWARD' - 'VALUE_EURO' (float)340 """341 query1 = {"$group": {"_id": "$CPV",342 "avgValueDiff": {"$avg": "$value"}, "avgTimeDiff": {"$avg": "$time"}}}343 query2 = {"$sort": {"avgTimeDiff": -1}}344 query3 = {"$limit": 5}345 pipeline_9 = [year_country_cpv_filter(bot_year, top_year, country_list), query1, query2,query3, lookup_cpv,346 unwind_cpv]347 myCursor = db.cpv_time_diff_2.aggregate(pipeline_9)348 list_documents = []349 for document in myCursor:350 list_documents.append({"cpv": document['cpv_lookup']['cpv_division_description'],351 "time_difference": document['avgTimeDiff'],352 "value_difference":document['avgValueDiff']})353 return list_documents354def ex10_country_box(bot_year=2008, top_year=2020, country_list=countries):355 """356 We want five numbers, described below357 Result filterable by floor year, roof year and country_list358 Expected Output:359 (avg_country_euro_avg, avg_country_count, avg_country_offer_avg, avg_country_euro_avg_y_eu, avg_country_euro_avg_n_eu)360 Where:361 avg_country_euro_avg = average value of each countries ('ISO_COUNTRY_CODE') contracts average 'VALUE_EURO', (int)362 avg_country_count = average value of each countries ('ISO_COUNTRY_CODE') contract count, (int)363 avg_country_offer_avg = average value of each countries ('ISO_COUNTRY_CODE') contracts average NUMBER_OFFERS', (int)364 avg_country_euro_avg_y_eu = average value of each countries ('ISO_COUNTRY_CODE') contracts average VALUE_EURO' with 'B_EU_FUNDS', (int)365 avg_country_euro_avg_n_eu = average value of each countries ('ISO_COUNTRY_CODE') contracts average 'VALUE_EURO' with out 'B_EU_FUNDS' (int)366 """367 avg_country_euro_avg = 0368 avg_country_count = 0369 avg_country_offer_avg = 0370 avg_country_euro_avg_y_eu = 0371 avg_country_euro_avg_n_eu = 0372 filter_match = year_country_filter(bot_year, top_year, country_list)373 temp_filtered_country = { "$out" : "temp_filtered_country" }374 num_countries=len(country_list)375 db.business_euro_sum.aggregate([filter_match, temp_filtered_country])376 pre = {"$project": {377 "sum_no": 1,378 "count": 1,379 "sum": 1,380 "sumValueBEuFundsNo": 1,381 "sumValueBEuFundsYes": 1,382 "country": "$_id.country",383 "year": "$_id.year",384 "address": "$_id.address",385 "business": "$_id.business"386 }387 }388 # 10.1389 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "averageAmount": {"$sum": "$sum"}, "count": {"$sum": "$count"}}}390 pipeline_10_1 = [pre, countryAvg_1]391 myCursor = db.temp_filtered_country.aggregate(pipeline_10_1)392 for document in myCursor:393 # print(document)394 avg_country_euro_avg += document['averageAmount'] / document['count']395 avg_country_euro_avg/=num_countries396 397 # 10.2398 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "count": {"$sum": "$count"}}}399 countryAvg_2 = {"$group": {"_id": {}, "count": {"$avg": "$count"}}}400 pipeline_10_2 = [pre, countryAvg_1, countryAvg_2]401 myCursor = db.temp_filtered_country.aggregate(pipeline_10_2)402 for document in myCursor:403 avg_country_count = document['count']404 # 10.3405 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "count": {"$sum": "$sum_no"}}}406 countryAvg_2 = {"$group": {"_id": {}, "count": {"$avg": "$count"}}}407 pipeline_10_3 = [pre, countryAvg_1, countryAvg_2]408 myCursor = db.temp_filtered_country.aggregate(pipeline_10_3)409 for document in myCursor:410 avg_country_offer_avg = document['count']411 # 10.4412 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "sum": {"$sum": "$sumValueBEuFundsYes"},413 "count": {"$sum": "$count"}}}414 pipeline_10_4= [pre, countryAvg_1]415 myCursor = db.temp_filtered_country.aggregate(pipeline_10_4)416 for document in myCursor:417 avg_country_euro_avg_y_eu += document['sum'] / document['count']418 avg_country_euro_avg_y_eu/=num_countries419 420 # 10.5421 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "sum": {"$sum": "$sumValueBEuFundsNo"}, "count": {"$sum": "$count"}}}422 pipeline_10_5 = [pre, countryAvg_1]423 myCursor = db.temp_filtered_country.aggregate(pipeline_10_5)424 for document in myCursor:425 avg_country_euro_avg_n_eu += document['sum'] / document['count']426 avg_country_euro_avg_n_eu/=num_countries427 return avg_country_euro_avg, avg_country_count, avg_country_offer_avg, avg_country_euro_avg_y_eu, avg_country_euro_avg_n_eu428def ex11_country_treemap(bot_year=2008, top_year=2020, country_list=countries):429 """430 Returns the count of contracts per country ('ISO_COUNTRY_CODE')431 Result filterable by floor year, roof year and country_list432 Expected Output (list of documents):433 [{country: value_1, count: value_2}, ....]434 Where:435 value_1 = Country ('ISO_COUNTRY_CODE') name, (string) (located in iso_codes collection')436 value_2 = contract count of each country, (int)437 """438 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "count": {"$sum": "$count"}}}439 pre = {"$project": {440 "count": 1,441 "country": "$_id.country",442 "year": "$_id.year",443 "address": "$_id.address",444 "business": "$_id.business"445 }446 }447 pipeline_10_1 = [year_country_filter(bot_year, top_year, country_list), pre, countryAvg_1]448 myCursor = db.business_euro_sum.aggregate(pipeline_10_1)449 list_documents = []450 for document in myCursor:451 list_documents.append({"country": document['_id']['country'], "count": document['count']})452 return list_documents453def ex12_country_bar_1(bot_year=2008, top_year=2020, country_list=countries):454 """455 Returns the average 'VALUE_EURO' for each country, return the highest 5 countries456 Result filterable by floor year, roof year and country_list457 Expected Output (list of 5 sorted documents):458 [{country: value_1, avg: value_2}, ....]459 Where:460 value_1 = Country ('ISO_COUNTRY_CODE') name, (string) (located in cpv collection as 'cpv_division_description')461 value_2 = average 'VALUE_EURO' of each country ('ISO_COUNTRY_CODE') name, (float)462 """463 pre = {"$project": {464 "count": 1,465 "sum": 1,466 "country": "$_id.country",467 "year": "$_id.year",468 "address": "$_id.address",469 "business": "$_id.business"470 }471 }472 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "sum": {"$sum": "$sum"}, "count": {"$sum": "$count"}}}473 countryAvg_2 = {"$project": {"country": 1, "avgEuro": {"$divide": ["$sum", "$count"]}}}474 query2 = {"$sort": {"avgEuro": -1}}475 query3 = {"$limit": 5}476 pipeline_10_1 = [year_country_filter(bot_year, top_year, country_list), pre, countryAvg_1, countryAvg_2, query2, query3]477 myCursor = db.business_euro_sum.aggregate(pipeline_10_1)478 list_documents = []479 for document in myCursor:480 list_documents.append({"country": document['_id']['country'], "avg": document['avgEuro']})481 return list_documents482def ex13_country_bar_2(bot_year=2008, top_year=2020, country_list=countries):483 """484 Group by country and get the average 'VALUE_EURO' for each group, return the lowest, average wise, 5 documents485 Result filterable by floor year, roof year and country_list486 Expected Output (list of 5 sorted documents):487 [{country: value_1, avg: value_2}, ....]488 Where:489 value_1 = Country ('ISO_COUNTRY_CODE') name, (string) (located in cpv collection as 'cpv_division_description')490 value_2 = average 'VALUE_EURO' of each country ('ISO_COUNTRY_CODE') name, (float)491 """492 pre = {"$project": {493 "count": 1,494 "sum": 1,495 "country": "$_id.country",496 "year": "$_id.year",497 "address": "$_id.address",498 "business": "$_id.business"499 }500 }501 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "sum": {"$sum": "$sum"}, "count": {"$sum": "$count"}}}502 countryAvg_2 = {"$project": {"country": 1, "avgEuro": {"$divide": ["$sum", "$count"]}}}503 query2 = {"$sort": {"avgEuro": 1}}504 query3 = {"$limit": 5}505 pipeline_10_1 = [year_country_filter(bot_year, top_year, country_list), pre, countryAvg_1, countryAvg_2, query2, query3]506 myCursor = db.business_euro_sum.aggregate(pipeline_10_1)507 list_documents = []508 for document in myCursor:509 list_documents.append({"country": document['_id']['country'], "avg": document['avgEuro']})510 return list_documents511def ex14_country_map(bot_year=2008, top_year=2020, country_list=countries):512 """513 For each country get the sum of the respective contracts 'VALUE_EURO' with 'B_EU_FUNDS'514 Result filterable by floor year, roof year and country_list515 Expected Output (list of documents):516 [{sum: value_1, country: value_2}, ....]517 Where:518 value_1 = sum 'VALUE_EURO' of country ('ISO_COUNTRY_CODE') name, (float)519 value_2 = country in ISO-A2 format (string) (located in iso_codes collection)520 """521 pre = {"$project": {522 "sumValueBEuFundsYes": 1,523 "country": "$_id.country",524 "year": "$_id.year",525 "address": "$_id.address",526 "business": "$_id.business"527 }528 }529 lookupCountry = { "$lookup": {530 "from": "iso_codes",531 "localField": "_id.country",532 "foreignField": "alpha-2",533 "as": "country_iso"534 }535 }536 countryAvg_1 = {"$group": {"_id": {"country": "$country"}, "sum": {"$sum": "$sumValueBEuFundsYes"}}}537 pipeline_10_1 = [year_country_filter(bot_year, top_year, country_list), pre, countryAvg_1, lookupCountry]538 myCursor = db.business_euro_sum.aggregate(pipeline_10_1)539 list_documents = []540 541 for document in myCursor:542 if document["_id"]["country"]!="UK":543 list_documents.append({544 "sum": document['sum'],545 "country": document['country_iso'][0]['name'],546 })547 else:548 list_documents.append({549 "sum": document['sum'],550 "country": "United Kingdom of Great Britain and Northern Ireland",551 })552 return list_documents553def ex15_business_box(bot_year=2008, top_year=2020, country_list=countries):554 """555 We want five numbers, described below556 Result filterable by floor year, roof year and country_list557 Expected Output:558 (avg_business_euro_avg, avg_business_count, avg_business_offer_avg, avg_business_euro_avg_y_eu, avg_business_euro_avg_n_eu)559 Where:560 avg_business_euro_avg = average value of each company ('CAE_NAME') contracts average 'VALUE_EURO', (int)561 avg_business_count = average value of each company ('CAE_NAME') contract count, (int)562 avg_business_offer_avg = average value of each company ('CAE_NAME') contracts average NUMBER_OFFERS', (int)563 avg_business_euro_avg_y_eu = average value of each company ('CAE_NAME') contracts average VALUE_EURO' with 'B_EU_FUNDS', (int)564 avg_business_euro_avg_n_eu = average value of each company ('CAE_NAME') contracts average 'VALUE_EURO' with out 'B_EU_FUNDS' (int)565 """566 avg_business_euro_avg = None567 avg_business_count = None568 avg_business_offer_avg = None569 avg_business_euro_avg_y_eu = None570 avg_business_euro_avg_n_eu = None571 filter_match = year_country_filter(bot_year, top_year, country_list)572 temp_filtered = { "$out" : "temp_filtered" }573 db.business_euro_sum.aggregate([filter_match, temp_filtered])574 # Question 15.1575 # the queries below calculates the avg of each company ('CAE_NAME') contracts avg 'VALUE_EURO', (int)576 businessAvg_0 = { "$group": { "_id": {"per_business_sum": "$_id.business"},577 "total_sum": {"$sum": "$sum"},578 "total_count": {"$sum": "$count"},579 }580 }581 582 businessAvg_1 = { "$project": {"total_sum": 1,583 "total_count": 1,584 "per_business_avg": { "$divide": [ "$total_sum", "$total_count" ] },585 }586 }587 businessAvg_2 = { "$group": { "_id": "avg_business_euro_avg", "averageAmount": {"$avg": "$per_business_avg"} } }588 # resulting pipeline589 pipeline_15_1 = [businessAvg_0, businessAvg_1, businessAvg_2]590 # cursor from the pipeline above591 # it assumes the existence of a pre-calculated collection (defined in insert_operation)592 myCursor = db.temp_filtered.aggregate(pipeline_15_1)593 for document in myCursor:594 avg_business_euro_avg = document['averageAmount']595 # Question 15.2596 # the query below calculates the average of each company's contract count597 businessAvg_3 = { "$group": { "_id": "avg_business_count", "averageCount": {"$avg": "$total_count"} } }598 # resulting pipeline599 pipeline_15_2 = [businessAvg_0, businessAvg_3]600 # cursor from the pipeline above601 # it assumes the existence of a pre-calculated collection (defined in insert_operation)602 myCursor = db.temp_filtered.aggregate(pipeline_15_2)603 for document in myCursor:604 avg_business_count = document['averageCount']605 # Question 15.3606 # the queries below calculate the avarage of each company's contracts average NUMBER_OFFERS607 businessAvg_4 = { "$group": { "_id": {"per_business_sum_no": "$_id.business"},608 "total_sum_no": {"$sum": "$sum_no"},609 "total_count": {"$sum": "$count"},610 }611 }612 613 businessAvg_5 = { "$project": {"total_sum_no": 1,614 "total_count": 1,615 "per_business_avg_no": { "$divide": [ "$total_sum_no", "$total_count" ] },616 }617 }618 businessAvg_6 = { "$group": { "_id": "avg_business_offer_avg", "averageNumberOffers": {"$avg": "$per_business_avg_no"} } }619 # resulting pipeline620 pipeline_15_3 = [businessAvg_4, businessAvg_5, businessAvg_6]621 # cursor from the pipeline above622 # it assumes the existence of a pre-calculated collection (defined in insert_operation)623 myCursor = db.temp_filtered.aggregate(pipeline_15_3)624 for document in myCursor:625 avg_business_offer_avg = document['averageNumberOffers']626 # Question 15.4627 # the query below calculates the avarage of each company's contracts average 'VALUE_EUROâ with âB_EU_FUNDSâ628 businessAvg_7 = { "$group": { "_id": {"per_business_sumValueBEuFundsYes": "$_id.business"},629 "total_sumValueBEuFundsYes": {"$sum": "$sumValueBEuFundsYes"},630 "total_count": {"$sum": "$count"},631 }632 }633 634 businessAvg_8 = { "$project": {"total_sumValueBEuFundsYes": 1,635 "total_count": 1,636 "per_business_avgValueBEuFundsYes": { "$divide": [ "$total_sumValueBEuFundsYes", "$total_count" ] },637 }638 }639 businessAvg_9 = { "$group": { "_id": "avg_business_euro_avg_y_eu", "averageAmountYes": {"$avg": "$per_business_avgValueBEuFundsYes"} } }640 # resulting pipeline641 pipeline_15_4 = [businessAvg_7, businessAvg_8, businessAvg_9]642 # cursor from the pipeline above643 # it assumes the existence of a pre-calculated collection (defined in insert_operation)644 myCursor = db.temp_filtered.aggregate(pipeline_15_4)645 for document in myCursor:646 avg_business_euro_avg_y_eu = document['averageAmountYes']647 # Question 15.5648 # the query below calculates the avarage of each company's contracts average 'VALUE_EUROâ without âB_EU_FUNDSâ649 businessAvg_10 = { "$group": { "_id": {"per_business_sumValueBEuFundsNo": "$_id.business"},650 "total_sumValueBEuFundsNo": {"$sum": "$sumValueBEuFundsNo"},651 "total_count": {"$sum": "$count"},652 }653 }654 655 businessAvg_11 = { "$project": {"total_sumValueBEuFundsNo": 1,656 "total_count": 1,657 "per_business_avgValueBEuFundsNo": { "$divide": [ "$total_sumValueBEuFundsNo", "$total_count" ] },658 }659 }660 businessAvg_12 = { "$group": { "_id": "avg_business_euro_avg_n_eu", "averageAmountNo": {"$avg": "$per_business_avgValueBEuFundsNo"} } }661 # resulting pipeline662 pipeline_15_5 = [businessAvg_10, businessAvg_11, businessAvg_12]663 # cursor from the pipeline above664 # it assumes the existence of a pre-calculated collection (defined in insert_operation)665 myCursor = db.temp_filtered.aggregate(pipeline_15_5)666 for document in myCursor:667 avg_business_euro_avg_n_eu = document['averageAmountNo']668 return avg_business_euro_avg, avg_business_count, avg_business_offer_avg, avg_business_euro_avg_y_eu, avg_business_euro_avg_n_eu669def ex16_business_bar_1(bot_year=2008, top_year=2020, country_list=countries):670 """671 Returns the average 'VALUE_EURO' for company ('CAE_NAME') return the highest 5 companies672 Result filterable by floor year, roof year and country_list673 Expected Output (list of 5 sorted documents):674 [{company: value_1, avg: value_2}, ....]675 Where:676 value_1 = company ('CAE_NAME') name, (string)677 value_2 = average 'VALUE_EURO' of each company ('CAE_NAME'), (float)678 """679 # Question 16680 # the queries below sort (descending) company's contracts average 'VALUE_EURO', (int)681 businessAvg_0 = { "$group": { "_id": {"per_business_sum": "$_id.business"},682 "total_sum": {"$sum": "$sum"},683 "total_count": {"$sum": "$count"},684 }685 }686 687 businessAvg_1 = { "$project": {"total_sum": 1,688 "total_count": 1,689 "per_business_avg": { "$divide": [ "$total_sum", "$total_count" ] },690 }691 }692 sortedAvg = { "$sort" : { "per_business_avg" : -1} }693 694 limitSorted = { "$limit" : 5 }695 # resulting pipeline696 pipeline_16 = [year_country_filter(bot_year, top_year, country_list),697 businessAvg_0, businessAvg_1, sortedAvg, limitSorted]698 #pipeline_16 = [ businessAvg_0, businessAvg_1, sortedAvg, limitSorted]699 # cursor from the pipeline above700 # it assumes the existence of a pre-calculated collection (defined in insert_operation)701 myCursor = db.business_euro_sum.aggregate(pipeline_16)702 list_documents = []703 for document in myCursor:704 list_documents.append({"company": document['_id']['per_business_sum'], "avg": document['per_business_avg']})705 return list_documents706def ex17_business_bar_2(bot_year=2008, top_year=2020, country_list=countries):707 """708 Returns the average 'VALUE_EURO' for company ('CAE_NAME') return the lowest 5 companies709 Result filterable by floor year, roof year and country_list710 Expected Output (list of 5 sorted documents):711 [{company: value_1, avg: value_2}, ....]712 Where:713 value_1 = company ('CAE_NAME') name, (string)714 value_2 = average 'VALUE_EURO' of each company ('CAE_NAME'), (float)715 """716 # Question 17717 # the queries below sort (ascending) company's contracts average 'VALUE_EURO', (int)718 businessAvg_0 = { "$group": { "_id": {"per_business_sum": "$_id.business"},719 "total_sum": {"$sum": "$sum"},720 "total_count": {"$sum": "$count"},721 }722 }723 724 businessAvg_1 = { "$project": {"total_sum": 1,725 "total_count": 1,726 "per_business_avg": { "$divide": [ "$total_sum", "$total_count" ] },727 }728 }729 sortedAvg = { "$sort" : { "per_business_avg" : 1} }730 731 limitSorted = { "$limit" : 5 }732 # resulting pipeline733 pipeline_17 = [year_country_filter(bot_year, top_year, country_list),734 businessAvg_0, businessAvg_1, sortedAvg, limitSorted]735 # cursor from the pipeline above736 # it assumes the existence of a pre-calculated collection (defined in insert_operation)737 myCursor = db.business_euro_sum.aggregate(pipeline_17)738 list_documents = []739 for document in myCursor:740 list_documents.append({"company": document['_id']['per_business_sum'], "avg": document['per_business_avg']})741 return list_documents742def ex18_business_treemap(bot_year=2008, top_year=2020, country_list=countries):743 """744 We want the count of contracts for each company 'CAE_NAME', for the highest 15745 Result filterable by floor year, roof year and country_list746 Expected Output (list of documents):747 [{company: value_1, count: value_2}, ....]748 Where:749 value_1 = company ('CAE_NAME'), (string)750 value_2 = contract count of each company ('CAE_NAME'), (int)751 """752 # Question 18753 # the query below sorts (ascending) each company's count of contracts754 sortedCount = { "$sort" : { "count" : -1} }755 limitSorted = { "$limit" : 15 }756 # resulting pipeline757 pipeline_18 = [year_country_filter(bot_year, top_year, country_list), sortedCount, limitSorted]758 # pipeline_18 = [ sortedCount, limitSorted]759 # cursor from the pipeline above760 # it assumes the existence of a pre-calculated collection (defined in insert_operation)761 myCursor = db.business_euro_sum.aggregate(pipeline_18)762 list_documents = []763 for document in myCursor:764 list_documents.append({"company": document['_id']['business'], "count": document['count']})765 return list_documents766def ex19_business_map(bot_year=2008, top_year=2020, country_list=countries):767 """768 For each country get the highest company ('CAE_NAME') in terms of 'VALUE_EURO' sum contract spending769 Result filterable by floor year, roof year and country_list770 Expected Output (list of documents):771 [{company: value_1, sum: value_2, country: value_3, address: value_4}, ....]772 Where:773 value_1 = 'top' company of that particular country ('CAE_NAME'), (string)774 value_2 = sum 'VALUE_EURO' of country and company ('CAE_NAME'), (float)775 value_3 = country in ISO-A2 format (string) (located in iso_codes collection)776 value_4 = company ('CAE_NAME') address, single string merging 'CAE_ADDRESS' and 'CAE_TOWN' separated by ' ' (space)777 """778 # Question 19779 sortedSum = { "$sort": { "_id.country" : 1, "sum" : -1} }780 781 firstInCountry = { "$group": {782 "_id": "$_id.country", 783 "sumEuro": { "$first": "$sum" },784 "businessName": { "$first": "$_id.business"},785 "businessAddress": { "$first": "$_id.address"},786 }787 }788 789 lookupCountry = { "$lookup": {790 "from": "iso_codes",791 "localField": "_id",792 "foreignField": "alpha-2",793 "as": "country_iso"794 }795 }796 # resulting pipeline797 pipeline_19 = [year_country_filter(bot_year, top_year, country_list), sortedSum, firstInCountry, lookupCountry]798 # cursor from the pipeline above799 # it assumes the existence of a pre-calculated collection (defined in insert_operation)800 myCursor = db.business_euro_sum.aggregate(pipeline_19, allowDiskUse = True)801 list_documents = []802 803 for document in myCursor:804 if document['_id'] != "UK":805 list_documents.append({"company": document['businessName'],806 "sum": document['sumEuro'],807 "country": document['country_iso'][0]['name'],808 "address": document['businessAddress']809 })810 else:811 list_documents.append({"company": document['businessName'],812 "sum": document['sumEuro'],813 "country": "United Kingdom of Great Britain and Northern Ireland",814 "address": document['businessAddress']815 })816 return list_documents817def ex20_business_connection(bot_year=2008, top_year=2020, country_list=countries):818 """819 We want the top 5 most frequent co-occurring companies ('CAE_NAME' and 'WIN_NAME')820 Result filterable by floor year, roof year and country_list821 Expected Output (list of documents):822 [{companies: value_1, count: value_2}, ....]823 Where:824 value_1 = company ('CAE_NAME') string merged with company ('WIN_NAME') seperated by the string ' with ', (string)825 value_2 = co-occurring number of contracts (int)826 """827 828 group = {"$group": {"_id": {829 "name": "$_id.co_name",830 },831 "total":{"$sum":"$count"},832 }833 }834 835 sort = { "$sort" : { "total" : -1} }836 limit_20 = { "$limit" : 5 }837 838 pipeline_20 = [year_country_filter(bot_year, top_year, country_list), group, sort, limit_20]839 myCursor = db.business_co_occurrences.aggregate(pipeline_20, allowDiskUse = True)840 list_documents = []841 for document in myCursor:842 list_documents.append({"companies": document['_id']['name'], "count": document['total']})843 return list_documents844def insert_operation(document):845 '''846 Insert operation.847 In case pre computed tables were generated for the queries they should be recomputed with the new data.848 '''849 inserted_ids = eu.insert_many(document).inserted_ids850 # Apply the pipeline to set 'CPV_DIVISION' in the EU collection. 851 eu.update_many({}, precomputing.pipeline_add_cpv_division)852 # apply pipeline for 1-7853 eu.aggregate(precomputing.pipeline_q1_to_q7, allowDiskUse=True)854 855 # apply pipline for 8856 eu.aggregate(precomputing.pipeline_q8, allowDiskUse=True)857 # apply pipeline for 9858 eu.aggregate(precomputing.pipeline_q9, allowDiskUse=True)859 # Apply the pipeline to precompute the average VALUE_EURO for each combination860 # of CAE_NAME, YEAR and ISO_COUNTRY_CODE861 # (collection BUSINESS_EURO_SUM)862 eu.aggregate(precomputing.pipeline_companies_10_to_19, allowDiskUse = True)863 # Apply the pipeline to precompute the count of contracts for each combination864 # of CAE_NAME and WIN_NAME, year and country. The _id contains 'co_name' which is an aggregation of865 # CAE_NAME and WIN_NAME, year and country of the contracts.866 # (collection BUSINESS_CO_OCCURRENCES)867 eu.aggregate(precomputing.pipeline_co_occurrences_20, allowDiskUse = True)868 return inserted_ids869query_list = [870 ex1_cpv_box, ex2_cpv_treemap, ex3_cpv_bar_1, ex4_cpv_bar_2,871 ex5_cpv_bar_3, ex6_cpv_bar_4, ex7_cpv_map, ex8_cpv_hist ,ex9_cpv_bar_diff,872 ex10_country_box, ex11_country_treemap, ex12_country_bar_1,873 ex13_country_bar_2, ex14_country_map, ex15_business_box,874 ex16_business_bar_1, ex17_business_bar_2, ex18_business_treemap,875 ex19_business_map, ex20_business_connection...
test_list_documents.py
Source:test_list_documents.py
1from test import get_user_session, cassette, sleep2from test.resources.documents import create_document, assert_core_document, delete_all_documents, assert_bib_document3def test_should_list_documents():4 session = get_user_session()5 delete_all_documents()6 with cassette('fixtures/resources/documents/list_documents/list_documents.yaml'):7 create_document(session)8 page = session.documents.list()9 assert len(page.items) == 110 assert page.count == 111 assert_core_document(page.items[0])12def test_should_page_through_documents():13 session = get_user_session()14 delete_all_documents()15 with cassette('fixtures/resources/documents/list_documents/page_through_documents.yaml'):16 create_document(session, 'title 1')17 create_document(session, 'title 2')...
tfidf.py
Source:tfidf.py
1import math2terms = {}3term_frequency = {}4inverse_frequency = {}5def tf ():6 total_terms = 07 for value in terms.keys():8 total_terms += terms[value]9 global term_frequency10 for term in terms:11 term_frequency[term] = terms[term] / total_terms12def idf (list_documents):13 global inverse_frequency14 num_documents = len(list_documents)15 global terms16 for term in terms:17 # number of documents with term t 18 cont = 019 for doc in list_documents:20 if term in doc:21 cont += 122 inverse_frequency[term] = math.log10(num_documents/cont)23 24def add_term (term):25 global terms26 if term in terms:27 terms[term] = terms[term] + 128 else:29 terms[term] = 130def tfidf (list_documents):31 #print (len(list_documents))32 # populate the list of terms occurrences33 global terms34 terms = {}35 for document in list_documents:36 [add_term(term) for term in document] 37 38 tf()39 idf(list_documents)40 global term_frequency, inverse_frequency41 tfidf = {}42 for term in terms.keys():43 tfidf[term] = "{0:.6f}".format(term_frequency[term] * inverse_frequency[term])44 ...
Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.
You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.
Get 100 minutes of automation test minutes FREE!!