Source: hadoop.py
# Copyright 2009-2012 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import posixpath
import re
from subprocess import Popen
from subprocess import PIPE
from subprocess import CalledProcessError

try:
    from cStringIO import StringIO
    StringIO  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    from StringIO import StringIO

from mrjob.fs.base import Filesystem
from mrjob.parse import is_uri
from mrjob.parse import urlparse
from mrjob.util import cmd_line
from mrjob.util import read_file

log = logging.getLogger('mrjob.fs.hadoop')

# used by mkdir()
HADOOP_FILE_EXISTS_RE = re.compile(r'.*File exists.*')

# used by ls()
HADOOP_LSR_NO_SUCH_FILE = re.compile(
    r'^lsr: Cannot access .*: No such file or directory.')

# used by rm() (see below)
HADOOP_RMR_NO_SUCH_FILE = re.compile(r'^rmr: hdfs://.*$')


class HadoopFilesystem(Filesystem):
    """Filesystem for URIs accepted by ``hadoop fs``. Typically you will get
    one of these via ``HadoopJobRunner().fs``, composed with
    :py:class:`~mrjob.fs.local.LocalFilesystem`.
    """

    def __init__(self, hadoop_bin):
        """:param hadoop_bin: path to ``hadoop`` binary"""
        super(HadoopFilesystem, self).__init__()
        self._hadoop_bin = hadoop_bin

    def can_handle_path(self, path):
        return is_uri(path)

    def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.

        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
            get back from hadoop (e.g. [0,1]). By default, we only expect 0.
            If we get an unexpected return code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
            matches a regex in this list (even if the returncode is bad)
        return_stdout -- return the stdout from the hadoop command rather
            than logging it. If this is False, we return the returncode
            instead.
        """
        args = self._hadoop_bin + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error

        if not return_stdout:
            for line in StringIO(stdout):
                log_func('STDOUT: ' + line.rstrip('\r\n'))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in StringIO(stderr):
                log_func('STDERR: ' + line.rstrip('\r\n'))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode

    def du(self, path_glob):
        """Get the size of a file, or None if it's not a file or doesn't
        exist."""
        try:
            stdout = self.invoke_hadoop(['fs', '-dus', path_glob],
                                        return_stdout=True)
        except CalledProcessError:
            raise IOError(path_glob)

        try:
            return sum(int(line.split()[1])
                       for line in stdout.split('\n')
                       if line.strip())
        except (ValueError, TypeError, IndexError):
            raise IOError(
                'Unexpected output from hadoop fs -du: %r' % stdout)

    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        try:
            stdout = self.invoke_hadoop(
                ['fs', '-lsr', path_glob],
                return_stdout=True,
                ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in StringIO(stdout):
            line = line.rstrip('\r\n')
            fields = line.split(' ')

            # Throw out directories
            if fields[0].startswith('d'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar # HDFS
            # -rwxrwxrwx 1 3276 010-01-13 14:00 /foo/bar # S3
            path_index = None
            for index, field in enumerate(fields):
                if len(field) == 5 and field[2] == ':':
                    path_index = (index + 1)

            if not path_index:
                raise IOError("Could not locate path in string '%s'" % line)

            path = line.split(' ', path_index)[-1]

            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path

    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self._hadoop_bin + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def stream():
            for line in cat_proc.stdout:
                yield line

            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)

        return read_file(filename, stream())

    def mkdir(self, path):
        try:
            self.invoke_hadoop(
                ['fs', '-mkdir', path], ok_stderr=[HADOOP_FILE_EXISTS_RE])
        except CalledProcessError:
            raise IOError("Could not mkdir %s" % path)

    def path_exists(self, path_glob):
        """Does the given path exist?

        If dest is a directory (ends with a "/"), we check if there are
        any files starting with that path.
        """
        try:
            return_code = self.invoke_hadoop(['fs', '-ls', path_glob],
                                             ok_returncodes=[0, -1, 255])
            return (return_code == 0)
        except CalledProcessError:
            raise IOError("Could not check path %s" % path_glob)

    def path_join(self, dirname, filename):
        return posixpath.join(dirname, filename)

    def rm(self, path_glob):
        if not is_uri(path_glob):
            super(HadoopFilesystem, self).rm(path_glob)

        if self.path_exists(path_glob):
            # hadoop fs -rmr will print something like:
            # Moved to trash: hdfs://hdnamenode:54310/user/dave/asdf
            # to STDOUT, which we don't care about.
            #
            # if we ask to delete a path that doesn't exist, it prints
            # to STDERR something like:
            # rmr: <path>
            # which we can safely ignore
            try:
                self.invoke_hadoop(
                    ['fs', '-rmr', path_glob],
                    return_stdout=True, ok_stderr=[HADOOP_RMR_NO_SUCH_FILE])
            except CalledProcessError:
                raise IOError("Could not rm %s" % path_glob)

    def touchz(self, dest):
        try:
            self.invoke_hadoop(['fs', '-touchz', dest])
        except CalledProcessError:
            ...
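The docstring above notes that a ``HadoopFilesystem`` is normally obtained via ``HadoopJobRunner().fs``, but it can also be constructed directly. A minimal usage sketch follows; the ``hadoop`` binary path and HDFS URIs are made up, and the import assumes the module lives at ``mrjob.fs.hadoop``, as the logger name suggests.

# Minimal sketch, not part of hadoop.py; binary path and URIs are hypothetical.
from mrjob.fs.hadoop import HadoopFilesystem

# hadoop_bin is concatenated with ['fs', ...] inside invoke_hadoop(), so it is
# passed as a list of command-line tokens rather than a single string.
fs = HadoopFilesystem(['/usr/local/hadoop/bin/hadoop'])

# ls() recursively yields file URIs (directories are skipped).
for uri in fs.ls('hdfs://namenode/user/dave/logs/'):
    print(uri)

# cat() is inherited from Filesystem: it lists matches, then streams each file
# through _cat_file(), decompressing if necessary.
for line in fs.cat('hdfs://namenode/user/dave/logs/part-*'):
    pass  # process each line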
Source: detector.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Detect Thermos tasks on disk

This module contains the TaskDetector, used to detect Thermos tasks within a given checkpoint root.
"""
import functools
import glob
import os
import re
from abc import abstractmethod

from twitter.common.lang import Compatibility, Interface

from apache.thermos.common.constants import DEFAULT_CHECKPOINT_ROOT
from apache.thermos.common.path import TaskPath


class PathDetector(Interface):
  @abstractmethod
  def get_paths(self):
    """Get a list of valid checkpoint roots."""


class FixedPathDetector(PathDetector):
  def __init__(self, path=DEFAULT_CHECKPOINT_ROOT):
    if not isinstance(path, Compatibility.string):
      raise TypeError('FixedPathDetector path should be a string, got %s' % type(path))
    self._paths = [path]

  def get_paths(self):
    return self._paths[:]


class ChainedPathDetector(PathDetector):
  def __init__(self, *detectors):
    for detector in detectors:
      if not isinstance(detector, PathDetector):
        raise TypeError('Expected detector %r to be a PathDetector, got %s' % (
            detector, type(detector)))
    self._detectors = detectors

  def get_paths(self):
    def iterate():
      for detector in self._detectors:
        for path in detector.get_paths():
          yield path
    return list(set(iterate()))


def memoized(fn):
  cache_attr_name = '__memoized_' + fn.__name__

  @functools.wraps(fn)
  def memoized_fn(self, *args):
    if not hasattr(self, cache_attr_name):
      setattr(self, cache_attr_name, {})
    cache = getattr(self, cache_attr_name)
    try:
      return cache[args]
    except KeyError:
      cache[args] = rv = fn(self, *args)
      return rv
  return memoized_fn


class TaskDetector(object):
  """
    Helper class in front of TaskPath to detect active/finished/running tasks. Performs no
    introspection on the state of a task; merely detects based on file paths on disk.
  """
  class Error(Exception): pass
  class MatchingError(Error): pass

  def __init__(self, root):
    self._root_dir = root
    self._pathspec = TaskPath()

  @memoized
  def __get_task_ids_patterns(self, state):
    path_glob = self._pathspec.given(
        root=self._root_dir,
        task_id="*",
        state=state or '*'
    ).getpath('task_path')
    path_regex = self._pathspec.given(
        root=re.escape(self._root_dir),
        task_id="(\S+)",
        state='(\S+)'
    ).getpath('task_path')
    return path_glob, re.compile(path_regex)

  def get_task_ids(self, state=None):
    path_glob, path_regex = self.__get_task_ids_patterns(state)
    for path in glob.glob(path_glob):
      try:
        task_state, task_id = path_regex.match(path).groups()
      except Exception:
        continue
      if state is None or task_state == state:
        yield (task_state, task_id)

  @memoized
  def __get_process_runs_patterns(self, task_id, log_dir):
    path_glob = self._pathspec.given(
        root=self._root_dir,
        task_id=task_id,
        log_dir=log_dir,
        process='*',
        run='*'
    ).getpath('process_logdir')
    path_regex = self._pathspec.given(
        root=re.escape(self._root_dir),
        task_id=re.escape(task_id),
        log_dir=log_dir,
        process='(\S+)',
        run='(\d+)'
    ).getpath('process_logdir')
    return path_glob, re.compile(path_regex)

  def get_process_runs(self, task_id, log_dir):
    path_glob, path_regex = self.__get_process_runs_patterns(task_id, log_dir)
    for path in glob.glob(path_glob):
      try:
        process, run = path_regex.match(path).groups()
      except Exception:
        continue
      yield process, int(run)

  def get_process_logs(self, task_id, log_dir):
    for process, run in self.get_process_runs(task_id, log_dir):
      for logtype in ('stdout', 'stderr'):
        path = (self._pathspec.with_filename(logtype).given(root=self._root_dir,
                                                            task_id=task_id,
                                                            log_dir=log_dir,
                                                            process=process,
                                                            run=run)
                                                     .getpath('process_logdir'))
        if os.path.exists(path):
          yield path

  def get_checkpoint(self, task_id):
    return self._pathspec.given(root=self._root_dir, task_id=task_id).getpath('runner_checkpoint')

  @memoized
  def __get_process_checkpoints_patterns(self, task_id):
    path_glob = self._pathspec.given(
        root=self._root_dir,
        task_id=task_id,
        process='*'
    ).getpath('process_checkpoint')
    path_regex = self._pathspec.given(
        root=re.escape(self._root_dir),
        task_id=re.escape(task_id),
        process='(\S+)',
    ).getpath('process_checkpoint')
    return path_glob, re.compile(path_regex)

  def get_process_checkpoints(self, task_id):
    path_glob, path_regex = self.__get_process_checkpoints_patterns(task_id)
    for path in glob.glob(path_glob):
      try:
        process, = path_regex.match(path).groups()
      except Exception:
        continue
        ...
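A short usage sketch for the classes above (not part of detector.py); it assumes the default checkpoint root and simply prints whatever tasks are found on disk.

# Hypothetical driver code; FixedPathDetector and TaskDetector are defined above.
detector = TaskDetector(root=FixedPathDetector().get_paths()[0])

# get_task_ids() yields (state, task_id) pairs inferred purely from file paths;
# pass state='active' or state='finished' to keep only one on-disk location.
for task_state, task_id in detector.get_task_ids():
    print(task_state, task_id)
    # Path of the runner checkpoint for this task.
    print('  checkpoint:', detector.get_checkpoint(task_id))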
Source: scans_to_tfrecords.py
import argparse
from pathlib import Path

import numpy as np
from scipy.ndimage import zoom
import tensorflow as tf
from tqdm import tqdm
import nrrd
from pydicom import dcmread
import nibabel as nib


def preprocess_scan(scan, downsample):
    "Apply some preprocessing to the image"
    if downsample != 1:
        z, y, x = scan.shape
        scan = zoom(scan, (48 / z, 256 / y, 256 / x), order=5)
        # scan = zoom(scan, 1 / downsample)
    scan = scan.astype(np.float32)
    return scan


def scan_to_example(scan):
    "Convert a scan (a NumPy array) to an Example class"
    z, y, x = scan.shape
    scan_raw = scan.tostring()
    scan_features = {
        "z": tf.train.Feature(int64_list=tf.train.Int64List(value=[z])),
        "y": tf.train.Feature(int64_list=tf.train.Int64List(value=[y])),
        "x": tf.train.Feature(int64_list=tf.train.Int64List(value=[x])),
        "scan_raw": tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[scan_raw])
        ),
    }
    return tf.train.Example(features=tf.train.Features(feature=scan_features))


def split_into_subsequences(data, s):
    """
    Split the input sequence into sublists of size s.
    >>> s = "abcdefg"
    >>> split_into_subsequences(s, 2)
    ['ab', 'cd', 'ef', 'g']
    """
    return [data[x : x + s] for x in range(0, len(data), s)]


def save_scan(writer, scan):
    """Serialize the scan in a tfrecord file.
    If the scan is not a regular volume (the first axis
    has length 0) then return doing nothing.
    """
    if scan.shape[0] < 1:
        return
    example = scan_to_example(scan)
    writer.write(example.SerializeToString())


def convert_nrrd(path_glob, output_dir_name, downsample):
    nrrd_files = [str(f) for f in Path(".").glob(path_glob)]
    nrrd_files = split_into_subsequences(nrrd_files, 10)
    output_dir = Path(output_dir_name)
    output_dir.mkdir()
    for i, chunk in tqdm(
        enumerate(nrrd_files, start=1), total=len(nrrd_files)
    ):
        tfrecord_fname = str(output_dir / f"{i:02}.tfrecord")
        with tf.io.TFRecordWriter(tfrecord_fname) as writer:
            for fname in chunk:
                scan, _ = nrrd.read(fname, index_order="C")
                scan = preprocess_scan(scan, downsample)
                save_scan(writer, scan)


def convert_dicom(path_glob, output_dir_name, downsample):
    dcm_directories = list(Path(".").glob(path_glob))
    dcm_directories = split_into_subsequences(dcm_directories, 10)
    output_dir = Path(output_dir_name)
    output_dir.mkdir()
    for i, chunk in tqdm(
        enumerate(dcm_directories, start=1), total=len(dcm_directories)
    ):
        tfrecord_fname = str(output_dir / f"{i:02}.tfrecord")
        with tf.io.TFRecordWriter(tfrecord_fname) as writer:
            for dcm_dir in chunk:
                dcm_files = Path(dcm_dir).glob("*.dcm")
                dcm_slices = [dcmread(str(f)) for f in dcm_files]
                is_volume = (
                    hasattr(dcm_slices[0], "SliceLocation")
                    and len(dcm_slices) >= 4
                )
                if is_volume:
                    dcm_slices = sorted(
                        dcm_slices, key=lambda x: x.SliceLocation
                    )
                    if all(
                        s.pixel_array.shape == (512, 512) for s in dcm_slices
                    ):
                        scan = np.stack([s.pixel_array for s in dcm_slices])
                        scan = preprocess_scan(scan, downsample)
                        save_scan(writer, scan)


def convert_nifti(path_glob, output_dir_name, downsample):
    nifti_files = [str(f) for f in Path(".").glob(path_glob)]
    nifti_files = split_into_subsequences(nifti_files, 10)
    output_dir = Path(output_dir_name)
    output_dir.mkdir()
    for i, chunk in tqdm(
        enumerate(nifti_files, start=1), total=len(nifti_files)
    ):
        tfrecord_fname = str(output_dir / f"{i:02}.tfrecord")
        with tf.io.TFRecordWriter(tfrecord_fname) as writer:
            for fname in chunk:
                nib_obj = nib.load(fname)
                scan = nib_obj.get_fdata().T  # transpose to obtain C order
                scan = preprocess_scan(scan, downsample)
                save_scan(writer, scan)


if __name__ == "__main__":
    import doctest

    doctest.testmod()
    parser = argparse.ArgumentParser(
        description="Convert multiple CT scans to a single tfrecord file"
    )
    parser.add_argument("file_type", choices=["nrrd", "dicom", "nifti"])
    parser.add_argument(
        "path_glob",
        help="Glob that identifies all the nrrd/dicom/nifti files to convert (must be inside quotes)",
    )
    parser.add_argument(
        "-d", "--downsample", type=float, default=1, help="Downscaling factor"
    )
    parser.add_argument(
        "output_dir",
        help="Name of the directory where to store the tfrecords files",
    )
    args = parser.parse_args()
    if args.file_type == "nrrd":
        convert_nrrd(args.path_glob, args.output_dir, args.downsample)
    if args.file_type == "dicom":
        convert_dicom(args.path_glob, args.output_dir, args.downsample)
    if args.file_type == "nifti":
        ...
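For reference, a read-back sketch (not part of scans_to_tfrecords.py): it parses records produced by ``scan_to_example`` using the same feature names and the float32 dtype written by ``preprocess_scan``; the tfrecord path is hypothetical.

import tensorflow as tf

# Feature spec mirrors scan_to_example(): three int64 scalars and raw bytes.
feature_spec = {
    "z": tf.io.FixedLenFeature([], tf.int64),
    "y": tf.io.FixedLenFeature([], tf.int64),
    "x": tf.io.FixedLenFeature([], tf.int64),
    "scan_raw": tf.io.FixedLenFeature([], tf.string),
}

def parse_scan(serialized):
    # Decode the raw float32 bytes and restore the (z, y, x) shape.
    parsed = tf.io.parse_single_example(serialized, feature_spec)
    scan = tf.io.decode_raw(parsed["scan_raw"], tf.float32)
    return tf.reshape(scan, tf.stack([parsed["z"], parsed["y"], parsed["x"]]))

dataset = tf.data.TFRecordDataset("output_dir/01.tfrecord").map(parse_scan)
for scan in dataset.take(1):
    print(scan.shape)  # e.g. (48, 256, 256) when downsampling was applied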
Source: base.py
# Copyright 2009-2012 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

log = logging.getLogger('mrjob.fs')


class Filesystem(object):
    """Some simple filesystem operations that are common across the local
    filesystem, S3, HDFS, and remote machines via SSH. Different runners
    provide functionality for different filesystems via their
    :py:attr:`~mrjob.runner.MRJobRunner.fs` attribute. The ``hadoop`` and
    ``emr`` runners provide support for multiple protocols using
    :py:class:`~mrjob.job.composite.CompositeFilesystem`.

    Protocol support:

    * :py:class:`mrjob.fs.hadoop.HadoopFilesystem`: ``hdfs://``, others
    * :py:class:`mrjob.fs.local.LocalFilesystem`: ``/``
    * :py:class:`mrjob.fs.s3.S3Filesystem`: ``s3://bucket/path``,
      ``s3n://bucket/path``
    * :py:class:`mrjob.fs.ssh.SSHFilesystem`: ``ssh://hostname/path``
    """

    def cat(self, path_glob):
        """cat all files matching **path_glob**, decompressing if necessary"""
        for filename in self.ls(path_glob):
            for line in self._cat_file(filename):
                yield line

    def du(self, path_glob):
        """Get the total size of files matching ``path_glob``

        Corresponds roughly to: ``hadoop fs -dus path_glob``
        """
        raise NotImplementedError

    def ls(self, path_glob):
        """Recursively list all files in the given path.

        We don't return directories for compatibility with S3 (which
        has no concept of them)

        Corresponds roughly to: ``hadoop fs -lsr path_glob``
        """
        raise NotImplementedError

    def _cat_file(self, path):
        raise NotImplementedError

    def mkdir(self, path):
        """Create the given dir and its subdirs (if they don't already
        exist).

        Corresponds roughly to: ``hadoop fs -mkdir path``
        """
        raise NotImplementedError

    def path_exists(self, path_glob):
        """Does the given path exist?

        Corresponds roughly to: ``hadoop fs -test -e path_glob``
        """
        raise NotImplementedError

    def path_join(self, dirname, filename):
        raise NotImplementedError

    def rm(self, path_glob):
        """Recursively delete the given file/directory, if it exists

        Corresponds roughly to: ``hadoop fs -rmr path_glob``
        """
        raise NotImplementedError

    def touchz(self, path):
        """Make an empty file in the given location. Raises an error if
        a non-zero length file already exists in that location.

        Corresponds to: ``hadoop fs -touchz path``
        """
        raise NotImplementedError

    def md5sum(self, path_glob):
        """Generate the md5 sum of the file at ``path``"""
        ...
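An illustrative subclass sketch (not mrjob's actual ``LocalFilesystem``; the name ``GlobFilesystem`` is hypothetical), implementing just enough of the interface to show how the inherited ``cat()`` composes ``ls()`` with ``_cat_file()``.

import glob
import os

class GlobFilesystem(Filesystem):
    """Partial, local-disk illustration; unimplemented methods still raise."""

    def ls(self, path_glob):
        # Recursively yield files (never directories) matching the glob.
        for path in glob.glob(path_glob):
            if os.path.isdir(path):
                for dirpath, _, filenames in os.walk(path):
                    for name in filenames:
                        yield os.path.join(dirpath, name)
            else:
                yield path

    def _cat_file(self, path):
        # Stream a single file line by line (no decompression in this sketch).
        with open(path, 'rb') as f:
            for line in f:
                yield line

    def path_exists(self, path_glob):
        return bool(glob.glob(path_glob))

    def path_join(self, dirname, filename):
        return os.path.join(dirname, filename)

# cat() is inherited unchanged: it lists the matching files, then streams each, e.g.
# for line in GlobFilesystem().cat('/tmp/logs/*.txt'): ...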