Best Python code snippet using avocado_python
data_split.py
Source: data_split.py
# Note: use to preprocess simulated or real HSC galaxies.
# We will split the dataset into five subsets:
# 1. training set for GaMorNet
# 2. validation set for GaMorNet
# 3. training set for PSFGAN
# 4. validation set for PSFGAN
# 5. common test set for GaMorNet + PSFGAN
# Modified from "sim_gal_preprocess.py"
import argparse
import glob
import os

import numpy as np
import pandas
from astropy.io import fits

# Paths
core_path = '/gpfs/loomis/project/urry/ct564/HSC/PSFGAN/'
galaxy_main = core_path + 'dimauro_0.5_1.0/'

# Other parameters
# Order doesn't matter (e.g. ['g', 'r'] is the same as ['r', 'g'])
filter_strings = ['g', 'r', 'i', 'z', 'y']

# Flux conversion parameters,
# i.e. [flux in nanoJy] * nJy_to_adu_per_AA = [flux in adu]
# HSC catalogs use nanoJy; GalSim uses adu.
# HSC Wide-G: radius=4.1m, exp_time=10min, quantum efficiency=0.864, gain=3.0, lambda effective=4754 Angstrom
# HSC Wide-R: radius=4.1m, exp_time=10min, quantum efficiency=0.956, gain=3.0, lambda effective=6175 Angstrom
# HSC Wide-I: radius=4.1m, exp_time=20min, quantum efficiency=0.882, gain=3.0, lambda effective=7711 Angstrom
# HSC Wide-Z: radius=4.1m, exp_time=20min, quantum efficiency=0.821, gain=3.0, lambda effective=8898 Angstrom
# HSC Wide-Y: radius=4.1m, exp_time=20min, quantum efficiency=0.517, gain=3.0, lambda effective=9762 Angstrom
nJy_to_adu_per_AA_filters = [0.0289698414, 0.0246781434, 0.0364652697, 0.0294152337, 0.0168839201]

# The desired image shape. Images of other shapes are reflect-padded or
# center-cropped to this shape.
desired_shape = [239, 239]

# Source families that share a column layout in the output catalogs
GAL_SIM_SOURCES = ("gal_sim_0_0.25", "gal_sim_0.25_0.5", "gal_sim_0.5_0.75", "gal_sim_0.5_1.0")
DIMAURO_SOURCES = ("dimauro_0_0.5", "dimauro_0.5_0.75", "dimauro_0.5_1.0")

GAL_SIM_COLUMNS = ['num_components', 'sersic_idx_d', 'R_e_d', 'axis_ratio_d', 'PA_d', 'flux_frac_d',
                   'sersic_idx_b', 'R_e_b', 'axis_ratio_b', 'PA_b', 'flux_frac_b']
SIMARD_COLUMNS = ['ra', 'dec', 'photoz_best', 'SClass', 'z', 'Scale', 'Rhlg', 'Rhlr',
                  'Rchl,g', 'Rchl,r', '(B/T)g', 'e(B/T)g', '(B/T)r', 'e(B/T)r']
DIMAURO_COLUMNS = ['ra', 'dec', 'photoz_best', 'RE_F606', 'RE_F814', 'RE_F125', 'RE_F160',
                   'N_F606', 'N_F814', 'N_F125', 'N_F160', 'B_T_m']

parser = argparse.ArgumentParser()


def source_columns(source):
    """Columns copied from the raw catalog; they depend on the raw-data source."""
    if source in GAL_SIM_SOURCES:
        return GAL_SIM_COLUMNS
    if source == "simard":
        return SIMARD_COLUMNS
    if source in DIMAURO_SOURCES:
        return DIMAURO_COLUMNS
    raise ValueError('Unknown source: %s' % source)


def build_row(source, current_row, obj_id, filter_string, flux_factor):
    """Assemble one output-catalog row for a single galaxy in a single filter."""
    row = {'object_id': obj_id}
    for key in source_columns(source):
        row[key] = current_row[key]
    if source in GAL_SIM_SOURCES:
        # Simulated galaxies already store the total flux in adu
        row[filter_string + '_total_flux'] = current_row['total_flux']
    else:
        # Real galaxies store the cmodel flux in nanoJy; convert it to adu
        row[filter_string + '_total_flux'] = current_row[filter_string + '_cmodel_flux'] * flux_factor
    return row


def resize_axis(image, axis, target):
    """Reflect-pad or center-crop one axis of `image` to length `target`."""
    current = image.shape[axis]
    if current < target:
        before = (target - current) // 2
        after = target - current - before
        pad_width = [(0, 0), (0, 0)]
        pad_width[axis] = (before, after)
        return np.pad(image, pad_width, 'reflect')
    if current > target:
        start = (current - target) // 2
        index = [slice(None), slice(None)]
        index[axis] = slice(start, start + target)
        return image[tuple(index)]
    return image


def data_split():
    # Make the split predictable
    np.random.seed(42)
    parser.add_argument("--gmn_train", default=6300)
    parser.add_argument("--gmn_eval", default=700)
    parser.add_argument("--psf_train", default=4500)
    parser.add_argument("--psf_eval", default=500)
    parser.add_argument("--test", default=1528)
    parser.add_argument("--shuffle", default="1")
    # Identify the source of the raw data; this determines the names of the
    # columns in the catalogs being created.
    # Options: the "gal_sim_*" sources, "simard", the "dimauro_*" sources (more to be added)
    parser.add_argument("--source", default="dimauro_0.5_1.0")
    parser.add_argument("--split", default="unequal")
    args = parser.parse_args()

    shuffle = bool(int(args.shuffle))
    source = str(args.source)
    split = str(args.split)
    num_filters = len(filter_strings)

    # The five subsets, in the order they are filled
    subset_order = ['psf_train', 'psf_eval', 'gmn_train', 'gmn_eval', 'test']
    capacities = {'psf_train': int(args.psf_train), 'psf_eval': int(args.psf_eval),
                  'gmn_train': int(args.gmn_train), 'gmn_eval': int(args.gmn_eval),
                  'test': int(args.test)}
    folder_names = {'psf_train': 'fits_train/', 'psf_eval': 'fits_eval/',
                    'gmn_train': 'gmn_train/', 'gmn_eval': 'gmn_eval/',
                    'test': 'fits_test/'}
    catalog_names = {'psf_train': 'catalog_train.csv', 'psf_eval': 'catalog_eval.csv',
                     'gmn_train': 'gmn_train.csv', 'gmn_eval': 'gmn_eval.csv',
                     'test': 'catalog_test.csv'}
    counts = {name: 0 for name in subset_order}
    num_total = 0
    num_resized = 0
    num_correctly_resized = 0
    num_negative_flux = 0

    # Input and output locations, one entry per filter
    hsc_folders = []
    hsc_catalogs = []
    out_folders = {name: [] for name in subset_order}
    out_rows = {name: [[] for _ in filter_strings] for name in subset_order}
    for filter_string in filter_strings:
        galaxy_per_filter = galaxy_main + filter_string + '-band/'
        hsc_folders.append(glob.glob(galaxy_per_filter + 'raw_data/images/')[0])
        hsc_catalogs.append(pandas.read_csv(glob.glob(galaxy_per_filter + 'raw_data/*.csv')[0]))
        for name in subset_order:
            folder = galaxy_per_filter + folder_names[name]
            if not os.path.exists(folder):
                os.makedirs(folder)
            out_folders[name].append(folder)

    # Main loop: iterate over the row numbers of the first catalog.
    # Row 1 of the CSV is the header, so data rows are numbered from 2.
    row_num_list = list(range(2, len(hsc_catalogs[0]) + 2))

    # Equal or unequal data split.
    # When using the "unequal" split, make sure hsc_catalogs[0] is already
    # labeled with an "is_bulge" column.
    if split == "equal":
        if shuffle:
            np.random.shuffle(row_num_list)
    elif split == "unequal":
        # Get the bulge list first
        bulge_list = list(hsc_catalogs[0]["is_bulge"])
        num_bulges = np.sum(bulge_list)
        num_non_bulges = len(hsc_catalogs[0]) - num_bulges
        # Sort "row_num_list" by "bulge_list" (bulges are sorted to the bottom)
        row_num_list = [x for _, x in sorted(zip(bulge_list, row_num_list))]
        if shuffle:
            # First shuffle the subset of bulges and the subset of non-bulges separately
            non_bulge_row_num_list = row_num_list[:num_non_bulges]
            bulge_row_num_list = row_num_list[num_non_bulges:]
            np.random.shuffle(non_bulge_row_num_list)
            np.random.shuffle(bulge_row_num_list)
            row_num_list = non_bulge_row_num_list + bulge_row_num_list
            # Next shuffle the psf_train+psf_eval block and the gmn_train+gmn_eval+test block
            num_psf = capacities['psf_train'] + capacities['psf_eval']
            psf_row_num_list = row_num_list[:num_psf]
            gmn_test_row_num_list = row_num_list[num_psf:]
            np.random.shuffle(psf_row_num_list)
            np.random.shuffle(gmn_test_row_num_list)
            row_num_list = psf_row_num_list + gmn_test_row_num_list

    for row_num in row_num_list:
        if source in GAL_SIM_SOURCES:
            obj_id = int(row_num - 2)
        else:  # "simard" and the "dimauro_*" sources
            obj_id = int(row_num)

        # Read the images
        images = []
        for i in range(num_filters):
            if source in GAL_SIM_SOURCES:
                fits_path = '%s/%s.fits' % (hsc_folders[i], obj_id)
            else:
                fits_path = '%s/%s-cutout-*.fits' % (hsc_folders[i], obj_id)
            file = glob.glob(fits_path)[0]
            images.append(fits.getdata(file))

        # Check that the flux is positive in every filter; if not, skip this galaxy
        negative_flux = False
        for i in range(num_filters):
            current_row = hsc_catalogs[i].iloc[row_num - 2]
            if source in GAL_SIM_SOURCES:
                total_flux = current_row['total_flux']
            else:
                total_flux = current_row[filter_strings[i] + '_cmodel_flux']
            if total_flux < 0:
                negative_flux = True
        if negative_flux:
            num_negative_flux += 1
            continue

        # Check whether the image has the desired shape in each filter;
        # if not, resize it (reflect-pad if too small, center-crop if too large)
        any_resized = False
        for i in range(num_filters):
            if list(images[i].shape) != desired_shape:
                any_resized = True
                images[i] = resize_axis(images[i], 0, desired_shape[0])
                images[i] = resize_axis(images[i], 1, desired_shape[1])
        if any_resized:
            num_resized += 1
            # Check whether each image has been correctly resized
            if all(list(image.shape) == desired_shape for image in images):
                num_correctly_resized += 1

        # Assign the galaxy to the first subset that still has room
        for name in subset_order:
            if counts[name] < capacities[name]:
                for i in range(num_filters):
                    # Save the image
                    image_name = out_folders[name][i] + str(obj_id) + '-' + filter_strings[i] + '.fits'
                    fits.PrimaryHDU(images[i]).writeto(image_name, overwrite=True)
                    # Also create a row for this image in the new catalog
                    current_row = hsc_catalogs[i].iloc[row_num - 2]
                    out_rows[name][i].append(build_row(source, current_row, obj_id,
                                                       filter_strings[i], nJy_to_adu_per_AA_filters[i]))
                counts[name] += 1
                num_total += 1
                break

    # At last, save the catalogs
    for i in range(num_filters):
        galaxy_per_filter = galaxy_main + filter_strings[i] + '-band/'
        column_list = ['object_id'] + source_columns(source) + [filter_strings[i] + '_total_flux']
        for name in subset_order:
            catalog = pandas.DataFrame(out_rows[name][i], columns=column_list)
            catalog.to_csv(galaxy_per_filter + catalog_names[name], index=False)

    # Print out how many galaxies were selected
    print(str(num_total) + ' galaxies are selected in total:')
    print(str(counts['gmn_train']) + ' galaxies in train set for GaMorNet')
    print(str(counts['gmn_eval']) + ' galaxies in eval set for GaMorNet')
    print(str(counts['psf_train']) + ' galaxies in train set for PSFGAN')
    print(str(counts['psf_eval']) + ' galaxies in eval set for PSFGAN')
    print(str(counts['test']) + ' galaxies in common test set')
    print(str(num_resized) + ' galaxies have been resized for having different initial sizes')
    print(str(num_correctly_resized) + ' galaxies have been CORRECTLY resized')
    print(str(num_negative_flux) + ' galaxies are discarded for having negative flux(es) in at least one filter')


if __name__ == '__main__':
    data_split()
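The "unequal" split relies on Python's stable sort: zipping the is_bulge labels with the row numbers and sorting pushes every bulge row to the bottom while preserving the relative order within each block, and the block-local shuffles keep that boundary intact. A minimal, self-contained sketch of the trick (the toy is_bulge values and row numbers below are invented for illustration):

import numpy as np

np.random.seed(42)

# Invented stand-ins for hsc_catalogs[0]["is_bulge"] and the CSV row numbers
bulge_list = [0, 1, 0, 0, 1, 0]
row_num_list = list(range(2, 8))  # data rows start at 2; row 1 is the header

# Stable sort on (is_bulge, row_num): non-bulges stay in front, bulges sink
row_num_list = [x for _, x in sorted(zip(bulge_list, row_num_list))]
num_non_bulges = len(bulge_list) - int(np.sum(bulge_list))
print(row_num_list)  # [2, 4, 5, 7, 3, 6]: bulge rows 3 and 6 end up at the bottom

# Shuffle each block independently so the bulge/non-bulge boundary survives
non_bulges = row_num_list[:num_non_bulges]
bulges = row_num_list[num_non_bulges:]
np.random.shuffle(non_bulges)
np.random.shuffle(bulges)
row_num_list = non_bulges + bulges  # still all non-bulges first, then all bulges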
log_reader.py
Source: log_reader.py
# Standard Library
import base64
import itertools
import json
from dataclasses import asdict, dataclass
from typing import Iterable, List, Optional

# Sematic
from sematic import storage
from sematic.abstract_future import FutureState
from sematic.db.models.resolution import Resolution, ResolutionKind, ResolutionStatus
from sematic.db.queries import get_resolution, get_run
from sematic.resolvers.cloud_resolver import (
    END_INLINE_RUN_INDICATOR,
    START_INLINE_RUN_INDICATOR,
)
from sematic.scheduling.external_job import JobType

# Why the "V1"? Because we will likely want to change the structure of
# the logs such that each file contains a different subset of logs. But
# when we make this change, we will still want logs written in the old
# structure to be readable, at least for a while. So we need to identify
# which structure the files are in somehow, and a v1/v2 prefix is how we
# can do it.
V1_LOG_PATH_FORMAT = "logs/v1/run_id/{run_id}/{log_kind}/"


def log_prefix(run_id: str, job_type: JobType):
    return V1_LOG_PATH_FORMAT.format(run_id=run_id, log_kind=job_type.value)


@dataclass
class LogLineResult:
    """Results of a query for log lines

    Attributes
    ----------
    more_before:
        Are there more lines before the first line returned?
    more_after:
        Are there more lines after the last line returned? Will be True
        if the answer is known to be yes, False if the answer is known to
        be no. If the answer is not known (i.e. run may still be in
        progress), True will be returned.
    lines:
        The actual log lines
    continuation_cursor:
        A string that can be used to continue traversing these logs from where you left
        off. If more_after is False, this will be set to None.
    log_unavailable_reason:
        A human-readable reason why logs are not available.
    """

    more_before: bool
    more_after: bool
    lines: List[str]
    continuation_cursor: Optional[str]
    log_unavailable_reason: Optional[str] = None


@dataclass
class Cursor:
    """A cursor representing a particular place in the process of traversing logs.

    Attributes
    ----------
    source_log_key:
        The storage key for the log that was being used when the search left off. If no
        logs have been found yet, will be None
    source_file_line_index:
        The line number BEFORE filters are applied within the log file being read.
        It will be the first line that HASN'T yet been read. If no logs have been found,
        will be -1.
    filter_strings:
        The filter strings that were used for this log traversal.
    run_id:
        The run id that was being used for this log traversal.
    """

    # Why include the source log file? Because we will soon likely want to break up
    # the logs for a single run such that each file contains a *different*
    # portion of the logs, and we will need to know which file to go to in
    # order to pick back up. The alternative would be to require
    # re-traversing already traversed files when continuing.
    source_log_key: Optional[str]
    source_file_line_index: int
    filter_strings: List[str]
    run_id: str

    def to_token(self) -> str:
        return str(
            base64.b64encode(json.dumps(asdict(self)).encode("utf8")), encoding="utf8"
        )

    @classmethod
    def from_token(cls, token: str):
        kwargs = json.loads(
            base64.b64decode(bytes(token, encoding="utf8")).decode("utf8")
        )
        return Cursor(**kwargs)

    @classmethod
    def nothing_found(cls, filter_strings: List[str], run_id: str):
        return Cursor(
            source_log_key=None,
            source_file_line_index=-1,
            filter_strings=filter_strings,
            run_id=run_id,
        )


@dataclass
class LogLine:
    source_file: str
    source_file_index: int
    line: str


def load_log_lines(
    run_id: str,
    continuation_cursor: Optional[str],
    max_lines: int,
    filter_strings: Optional[List[str]] = None,
) -> LogLineResult:
    """Load a portion of the logs for a particular run

    Parameters
    ----------
    run_id:
        The id of the run to get logs for
    continuation_cursor:
        A cursor indicating where to continue reading logs from. Should be
        None if the logs are being read from the beginning.
    max_lines:
        The highest number of log lines that should be returned at once
    filter_strings:
        Only log lines that contain ALL of the strings in this list will
        be included in the result

    Returns
    -------
    A subset of the logs for the given run
    """
    run = get_run(run_id)
    run_state = FutureState[run.future_state]  # type: ignore
    still_running = not (run_state.is_terminal() or run_state == FutureState.RAN)
    resolution = get_resolution(run.root_id)
    filter_strings = filter_strings if filter_strings is not None else []
    cursor = (
        Cursor.from_token(continuation_cursor)
        if continuation_cursor is not None
        else Cursor.nothing_found(filter_strings, run_id)
    )
    if cursor.run_id != run_id:
        raise ValueError(
            f"Tried to continue a log search of {run_id} using a "
            f"continuation cursor from {cursor.run_id}"
        )
    if set(cursor.filter_strings) != set(filter_strings):
        raise ValueError(
            f"Tried to continue a log search of {run_id} using a "
            f"different set of filters than were used in the cursor."
        )
    if ResolutionStatus[resolution.status] in (  # type: ignore
        ResolutionStatus.CREATED,
        ResolutionStatus.SCHEDULED,
    ):
        return LogLineResult(
            more_before=False,
            more_after=True,
            lines=[],
            continuation_cursor=cursor.to_token(),
            log_unavailable_reason="Resolution has not started yet.",
        )
    if FutureState[run.future_state] == FutureState.CREATED:  # type: ignore
        return LogLineResult(
            more_before=False,
            more_after=True,
            lines=[],
            continuation_cursor=cursor.to_token(),
            log_unavailable_reason="The run has not yet started executing.",
        )
    # Looking for external jobs to determine whether the run is inline is only
    # valid because we know the run has at least reached SCHEDULED (it is not
    # CREATED).
    is_inline = len(run.external_jobs) == 0
    if is_inline:
        return _load_inline_logs(
            run_id=run_id,
            resolution=resolution,
            still_running=still_running,
            cursor_file=cursor.source_log_key,
            cursor_line_index=cursor.source_file_line_index,
            max_lines=max_lines,
            filter_strings=filter_strings,
        )
    return _load_non_inline_logs(
        run_id=run_id,
        still_running=still_running,
        cursor_file=cursor.source_log_key,
        cursor_line_index=cursor.source_file_line_index,
        max_lines=max_lines,
        filter_strings=filter_strings,
    )


def _get_latest_log_file(prefix, cursor_file) -> Optional[str]:
    # Recall that for v1 logs, each log file contains ALL the logs from
    # the beginning of the run until the time that file was uploaded. So
    # the latest log file contains all the logs we have for the run.
    log_files = storage.get_child_paths(prefix)
    if len(log_files) < 1:
        return None
    if cursor_file is not None and cursor_file not in log_files:
        raise RuntimeError(
            f"Trying to continue a log traversal from {cursor_file}, but "
            f"that file doesn't exist."
        )
    # The file with the highest timestamp has the full logs. Strip the prefix,
    # extension, and any slashes so the remaining timestamp can be parsed.
    latest_log_file = max(
        log_files,
        key=lambda path_key: int(
            path_key.replace(prefix, "").replace(".log", "").replace("/", "")
        ),
    )
    return latest_log_file


def _load_non_inline_logs(
    run_id: str,
    still_running: bool,
    cursor_file: Optional[str],
    cursor_line_index: int,
    max_lines: int,
    filter_strings: List[str],
) -> LogLineResult:
    """Load the lines for runs that are NOT inline"""
    prefix = log_prefix(run_id, JobType.worker)
    latest_log_file = _get_latest_log_file(prefix, cursor_file)
    if latest_log_file is None:
        return LogLineResult(
            more_before=False,
            more_after=still_running,
            lines=[],
            continuation_cursor=Cursor.nothing_found(filter_strings, run_id).to_token()
            if still_running
            else None,
            log_unavailable_reason="No log files found",
        )
    text_stream = storage.get_line_stream(latest_log_file)
    line_stream = (
        LogLine(source_file=latest_log_file, source_file_index=i, line=ln)
        for i, ln in zip(itertools.count(), text_stream)
    )
    return get_log_lines_from_line_stream(
        line_stream=line_stream,
        still_running=still_running,
        cursor_source_file=latest_log_file,
        cursor_line_index=cursor_line_index,
        max_lines=max_lines,
        filter_strings=filter_strings,
        run_id=run_id,
    )


def _load_inline_logs(
    run_id: str,
    resolution: Resolution,
    still_running: bool,
    cursor_file: Optional[str],
    cursor_line_index: int,
    max_lines: int,
    filter_strings: List[str],
) -> LogLineResult:
    """Load the lines for runs that ARE inline"""
    if ResolutionKind[resolution.kind] == ResolutionKind.LOCAL:  # type: ignore
        return LogLineResult(
            more_before=False,
            more_after=False,
            lines=[],
            continuation_cursor=None,
            log_unavailable_reason=(
                "UI logs are only available for runs that "
                "(a) are executed using the CloudResolver and "
                "(b) are using the resolver in non-detached mode OR have inline=False."
            ),
        )
    prefix = log_prefix(resolution.root_id, JobType.driver)
    latest_log_file = _get_latest_log_file(prefix, cursor_file)
    if latest_log_file is None:
        return LogLineResult(
            more_before=False,
            more_after=still_running,
            continuation_cursor=Cursor.nothing_found(filter_strings, run_id).to_token()
            if still_running
            else None,
            lines=[],
            log_unavailable_reason="Resolver logs are missing",
        )
    text_stream: Iterable[str] = storage.get_line_stream(latest_log_file)
    line_stream = _filter_for_inline(text_stream, run_id, latest_log_file)
    return get_log_lines_from_line_stream(
        line_stream=line_stream,
        still_running=still_running,
        cursor_source_file=cursor_file,
        cursor_line_index=cursor_line_index,
        max_lines=max_lines,
        filter_strings=filter_strings,
        run_id=run_id,
    )


def _filter_for_inline(
    text_stream: Iterable[str], run_id: str, source_file: str
) -> Iterable[LogLine]:
    """Stream resolver logs to make a new stream with only lines for a particular run"""
    expected_start = START_INLINE_RUN_INDICATOR.format(run_id)
    expected_end = END_INLINE_RUN_INDICATOR.format(run_id)
    buffer_iterator = iter(text_stream)
    found_start = False
    file_line_index = 0
    while True:
        try:
            line = next(buffer_iterator)
        except StopIteration:
            # If a resolver dies mid-execution of an inline run, we should
            # treat the end of the existing lines as the end of whatever
            # inline run we were looking for.
            break
        if expected_start in line:
            found_start = True
            continue
        if not found_start:
            continue
        if expected_end in line:
            break
        yield LogLine(
            source_file=source_file, source_file_index=file_line_index, line=line
        )
        file_line_index += 1


def get_log_lines_from_line_stream(
    line_stream: Iterable[LogLine],
    still_running: bool,
    cursor_source_file: Optional[str],
    cursor_line_index: int,
    max_lines: int,
    filter_strings: List[str],
    run_id: str,
) -> LogLineResult:
    """Given a stream of log lines, produce an object containing the desired subset

    Parameters
    ----------
    line_stream:
        An iterable stream of log lines
    still_running:
        A boolean indicating whether the run these logs are for is still running or not
    cursor_source_file:
        The source file to continue from. No lines should be returned until this file is
        reached.
    cursor_line_index:
        The line index within the source file to continue from. No lines should be
        returned until this index is reached.
    max_lines:
        The maximum number of lines that should be returned
    filter_strings:
        A list of strings to filter log lines by. Only log lines that contain ALL of the
        filters will be returned.
    run_id:
        The id of the run the traversal is for.

    Returns
    -------
    A subset of the logs for the given run
    """
    buffer_iterator = iter(line_stream)
    keep_going = True
    lines = []
    has_more = True
    more_before = False
    source_file = None
    source_file_line_index = -1
    found_cursor = False

    def passes_filter(line: LogLine) -> bool:
        return all(substring in line.line for substring in filter_strings)

    while keep_going:
        try:
            line = next(buffer_iterator)
            source_file = line.source_file
            source_file_line_index = line.source_file_index
            if not found_cursor:
                if cursor_source_file is None or (
                    source_file == cursor_source_file
                    and source_file_line_index >= cursor_line_index
                ):
                    found_cursor = True
                else:
                    more_before = more_before or passes_filter(line)
                    continue
            if not passes_filter(line):
                continue
            lines.append(line.line)
            if len(lines) >= max_lines:
                has_more = True
                keep_going = False
        except StopIteration:
            keep_going = False
            # Hit the end of the logs produced so far. If the run is
            # done, there are no more logs. Otherwise more might show up!
            has_more = still_running
    missing_reason = None if len(lines) > 0 else "No matching log lines."
    return LogLineResult(
        more_before=more_before,
        more_after=has_more,
        lines=lines,
        continuation_cursor=Cursor(
            source_log_key=source_file,
            # +1: next time we want to start AFTER where we last read
            source_file_line_index=source_file_line_index + 1,
            filter_strings=filter_strings,
            run_id=run_id,
        ).to_token()
        if has_more
        else None,
        log_unavailable_reason=missing_reason,
    )
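The continuation token that load_log_lines hands back is just the Cursor dataclass serialized to JSON and then base64-encoded, so it can round-trip through a client unchanged. A standalone sketch of that round trip, with the dataclass re-declared locally so it runs without a Sematic installation (the run id and storage key below are made up):

import base64
import json
from dataclasses import asdict, dataclass
from typing import List, Optional


@dataclass
class Cursor:
    """Local re-declaration of the Cursor dataclass above, for illustration only."""

    source_log_key: Optional[str]
    source_file_line_index: int
    filter_strings: List[str]
    run_id: str

    def to_token(self) -> str:
        # JSON-serialize the dataclass fields, then base64-encode the bytes
        return str(base64.b64encode(json.dumps(asdict(self)).encode("utf8")), encoding="utf8")

    @classmethod
    def from_token(cls, token: str):
        # Reverse the two steps and rebuild the dataclass
        kwargs = json.loads(base64.b64decode(bytes(token, encoding="utf8")).decode("utf8"))
        return cls(**kwargs)


# Hypothetical values following the v1 worker-log layout
cursor = Cursor(
    source_log_key="logs/v1/run_id/abc123/worker/1690000000.log",
    source_file_line_index=42,
    filter_strings=["ERROR"],
    run_id="abc123",
)
token = cursor.to_token()
assert Cursor.from_token(token) == cursor  # dataclass equality compares all fields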
heuristic_filtering.py
Source: heuristic_filtering.py
import re
from typing import List, Optional, Set, Union


def process_label(label: str, lowercase: bool = True, stop_words: Optional[Set[str]] = None) -> Union[List[str], None]:
    """Heuristically filter and process label(s)"""
    if not label:
        return None
    # Handle multi-labels
    label_delimiters_regex = re.compile('|'.join([';', '/']))
    labels = set(l.strip() for l in re.split(label_delimiters_regex, label))
    filter_strings = ['section', 'etc', 'now', 'whereas', 'exhibit ',
                      'therefore', 'article', 'in witness whereof', 'schedule']
    filtered_labels = set([])
    for label in labels:
        if len(label) < 3 or len(label) > 75 or \
                not label[0].isupper() or \
                any(bw for bw in filter_strings if label.lower().startswith(bw)):
            continue
        if label[-1] in ['.', ':']:  # remove scraping artifacts
            label = label[:-1]
        label = re.sub('[ \t]+', ' ', label.replace('\n', ' ').strip())
        if label:
            if stop_words:
                if label.lower() in stop_words:
                    continue
                label_words = label.split(' ')
                if len(label_words) > 1:
                    if len(label_words[-1]) > 1 and label_words[-1].lower() in stop_words:
                        continue
                if (label_words[0].lower() in stop_words or label_words[0].lower() in {'without', 'due'}) and \
                        label_words[0].lower() not in {'other', 'further', 'no', 'not', 'own', 'off'}:
                    continue
            label = label.lower() if lowercase else label
            filtered_labels.add(label)
    return list(filtered_labels)


def process_text(text: str) -> Union[str, None]:
    """Heuristically filter and process provision text"""
    text = text.strip()
    # Both the curly (\u201c) and straight (") quote variants appear in scraped text
    filter_strings = ['\u201c means', '" means', 'shall mean', "' means",
                      'shall have the meaning', 'has the meaning', 'have meaning']
    if len(text) < 25 or \
            text[0].islower() or \
            text[0] in ['"', '\u201c'] or \
            any(bw for bw in filter_strings if bw in text):
        return None
    if text[0] in ['.', ':']:
        text = text[1:].strip()
    if not text[0].isupper() and not text[0] in ['(', '[']:
        return None
    if not text[-1] == '.':
        return None
    text = re.sub('[ \t]+', ' ', text.replace('\n', ' ').strip())
    return text
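Both helpers are plain functions, so they are easy to try on sample strings. A quick usage sketch, assuming the file is importable as heuristic_filtering (the sample labels, provision text, and stop-word set are invented):

from heuristic_filtering import process_label, process_text

stop_words = {"the", "of", "and"}  # invented stop words

# Multi-label strings are split on ';' and '/' before filtering
print(process_label("Governing Law; Severability", stop_words=stop_words))
# -> ['governing law', 'severability'] (order may vary; a set is used internally)

# Labels starting with a blocked word such as "section" are dropped
print(process_label("Section 4.2"))  # -> []

# Definitional provisions are rejected...
print(process_text('"Affiliate" means any entity that controls a party.'))  # -> None

# ...while ordinary provision text is whitespace-normalized and returned
print(process_text("Each party shall maintain the  confidentiality\nof the terms."))
# -> 'Each party shall maintain the confidentiality of the terms.'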