Best Python code snippets using pandera
Source: schemas.py
...
            if column.regex:
                regex_dtype.update(
                    {
                        c: column.dtype
                        for c in column.get_regex_columns(dataframe.columns)
                    }
                )
        return {
            **{n: c.dtype for n, c in self.columns.items() if not c.regex},
            **regex_dtype,
        }

    @property
    def dtype(
        self,
    ) -> DataType:
        """Get the dtype property."""
        return self._dtype  # type: ignore

    @dtype.setter
    def dtype(self, value: PandasDtypeInputTypes) -> None:
        """Set the pandas dtype property."""
        self._dtype = pandas_engine.Engine.dtype(value) if value else None

    def _coerce_dtype(self, obj: pd.DataFrame) -> pd.DataFrame:
        if self.dtype is None:
            raise ValueError(
                "dtype argument is None. Must specify this argument "
                "to coerce dtype"
            )
        try:
            return self.dtype.try_coerce(obj)
        except errors.ParserError as exc:
            raise errors.SchemaError(
                self,
                obj,
                (
                    f"Error while coercing '{self.name}' to type "
                    f"{self.dtype}: {exc}"
                ),
                failure_cases=exc.failure_cases,
                check=f"coerce_dtype('{self.dtype}')",
            ) from exc

    def coerce_dtype(self, obj: pd.DataFrame) -> pd.DataFrame:
        """Coerce dataframe to the type specified in dtype.

        :param obj: dataframe to coerce.
        :returns: dataframe with coerced dtypes
        """
        error_handler = SchemaErrorHandler(lazy=True)

        def _try_coercion(coerce_fn, obj):
            try:
                return coerce_fn(obj)
            except errors.SchemaError as exc:
                error_handler.collect_error("dtype_coercion_error", exc)
                return obj

        for colname, col_schema in self.columns.items():
            if col_schema.regex:
                try:
                    matched_columns = col_schema.get_regex_columns(obj.columns)
                except errors.SchemaError:
                    matched_columns = pd.Index([])
                for matched_colname in matched_columns:
                    if col_schema.coerce or self.coerce:
                        obj[matched_colname] = _try_coercion(
                            col_schema.coerce_dtype, obj[matched_colname]
                        )
            elif (
                (col_schema.coerce or self.coerce)
                and self.dtype is None
                and colname in obj
            ):
                obj[colname] = _try_coercion(
                    col_schema.coerce_dtype, obj[colname]
                )

        if self.dtype is not None:
            obj = _try_coercion(self._coerce_dtype, obj)
        if self.index is not None and (self.index.coerce or self.coerce):
            index_schema = copy.deepcopy(self.index)
            if self.coerce:
                # coercing at the dataframe-level should apply index coercion
                # for both single- and multi-indexes.
                index_schema._coerce = True
            coerced_index = _try_coercion(index_schema.coerce_dtype, obj.index)
            if coerced_index is not None:
                obj.index = coerced_index

        if error_handler.collected_errors:
            raise errors.SchemaErrors(error_handler.collected_errors, obj)

        return obj

    def validate(
        self,
        check_obj: pd.DataFrame,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> pd.DataFrame:
        """Check if all columns in a dataframe have a column in the Schema.

        :param pd.DataFrame check_obj: the dataframe to be validated.
        :param head: validate the first n rows. Rows overlapping with `tail` or
            `sample` are de-duplicated.
        :param tail: validate the last n rows. Rows overlapping with `head` or
            `sample` are de-duplicated.
        :param sample: validate a random sample of n rows. Rows overlapping
            with `head` or `tail` are de-duplicated.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all validation
            checks and raises a ``SchemaErrors``. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :param inplace: if True, applies coercion to the object of validation,
            otherwise creates a copy of the data.
        :returns: validated ``DataFrame``
        :raises SchemaError: when ``DataFrame`` violates built-in or custom
            checks.

        :example:

        Calling ``schema.validate`` returns the dataframe.

        >>> import pandas as pd
        >>> import pandera as pa
        >>>
        >>> df = pd.DataFrame({
        ...     "probability": [0.1, 0.4, 0.52, 0.23, 0.8, 0.76],
        ...     "category": ["dog", "dog", "cat", "duck", "dog", "dog"]
        ... })
        >>>
        >>> schema_withchecks = pa.DataFrameSchema({
        ...     "probability": pa.Column(
        ...         float, pa.Check(lambda s: (s >= 0) & (s <= 1))),
        ...
        ...     # check that the "category" column contains a few discrete
        ...     # values, and the majority of the entries are dogs.
        ...     "category": pa.Column(
        ...         str, [
        ...             pa.Check(lambda s: s.isin(["dog", "cat", "duck"])),
        ...             pa.Check(lambda s: (s == "dog").mean() > 0.5),
        ...         ]),
        ... })
        >>>
        >>> schema_withchecks.validate(df)[["probability", "category"]]
           probability category
        0         0.10      dog
        1         0.40      dog
        2         0.52      cat
        3         0.23     duck
        4         0.80      dog
        5         0.76      dog
        """
        if not check_utils.is_table(check_obj):
            raise TypeError(f"expected pd.DataFrame, got {type(check_obj)}")

        if hasattr(check_obj, "dask"):
            # special case for dask dataframes
            if inplace:
                check_obj = check_obj.pandera.add_schema(self)
            else:
                check_obj = check_obj.copy()

            check_obj = check_obj.map_partitions(
                self._validate,
                head=head,
                tail=tail,
                sample=sample,
                random_state=random_state,
                lazy=lazy,
                inplace=inplace,
                meta=check_obj,
            )
            return check_obj.pandera.add_schema(self)

        return self._validate(
            check_obj=check_obj,
            head=head,
            tail=tail,
            sample=sample,
            random_state=random_state,
            lazy=lazy,
            inplace=inplace,
        )

    def _validate(
        self,
        check_obj: pd.DataFrame,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> pd.DataFrame:
        # pylint: disable=too-many-locals,too-many-branches,too-many-statements
        if self._is_inferred:
            warnings.warn(
                f"This {type(self)} is an inferred schema that hasn't been "
                "modified. It's recommended that you refine the schema "
                "by calling `add_columns`, `remove_columns`, or "
                "`update_columns` before using it to validate data.",
                UserWarning,
            )

        error_handler = SchemaErrorHandler(lazy)

        if not inplace:
            check_obj = check_obj.copy()

        if hasattr(check_obj, "pandera"):
            check_obj = check_obj.pandera.add_schema(self)

        # dataframe strictness check makes sure all columns in the dataframe
        # are specified in the dataframe schema
        if self.strict or self.ordered:
            column_names: List[Any] = []
            for col_name, col_schema in self.columns.items():
                if col_schema.regex:
                    try:
                        column_names.extend(
                            col_schema.get_regex_columns(check_obj.columns)
                        )
                    except errors.SchemaError:
                        pass
                elif col_name in check_obj.columns:
                    column_names.append(col_name)
            # ordered "set" of columns
            sorted_column_names = iter(dict.fromkeys(column_names))
            expanded_column_names = frozenset(column_names)

            # drop adjacent duplicated column names
            if check_obj.columns.has_duplicates:
                columns = [k for k, _ in itertools.groupby(check_obj.columns)]
            else:
                columns = check_obj.columns

            for column in columns:
...
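The coerce_dtype machinery above is what runs when a schema or one of its columns is declared with coerce=True. The following is a minimal sketch of that behavior through the public API, assuming the pandera version shown in the snippet; the column name and data are invented for illustration:

import pandas as pd
import pandera as pa

# Schema-level coerce=True routes validate() through coerce_dtype first.
schema = pa.DataFrameSchema(
    columns={"probability": pa.Column(float)},
    coerce=True,
)

# "probability" arrives as strings; validation coerces it to float64.
df = pd.DataFrame({"probability": ["0.1", "0.4", "0.52"]})
validated = schema.validate(df)
print(validated["probability"].dtype)  # float64

If a value cannot be coerced, the _try_coercion helper above collects the failure into a SchemaErrorHandler, so all coercion errors surface together as one SchemaErrors rather than one at a time.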
Source: schema_components.py
...
                lazy,
                inplace=inplace,
            )

        column_keys_to_check = (
            self.get_regex_columns(check_obj.columns)
            if self._regex
            else [self._name]
        )

        for column_name in column_keys_to_check:
            if self.coerce:
                check_obj[column_name] = self.coerce_dtype(
                    check_obj[column_name]
                )

            if check_utils.is_table(check_obj[column_name]):
                for i in range(check_obj[column_name].shape[1]):
                    validate_column(
                        check_obj[column_name].iloc[:, [i]], column_name
                    )
            else:
                validate_column(check_obj, column_name)

        return check_obj

    def get_regex_columns(
        self, columns: Union[pd.Index, pd.MultiIndex]
    ) -> Union[pd.Index, pd.MultiIndex]:
        """Get matching column names based on regex column name pattern.

        :param columns: columns to regex pattern match
        :returns: matching columns
        """
        if isinstance(self.name, tuple):
            # handle MultiIndex case
            if len(self.name) != columns.nlevels:
                raise IndexError(
                    f"Column regex name='{self.name}' is a tuple, expected a "
                    f"MultiIndex columns with {len(self.name)} number of "
                    f"levels, found {columns.nlevels} level(s)"
                )
...
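Beyond its internal use in Column.validate, get_regex_columns can be called directly to preview which dataframe columns a regex-named Column will match. A short sketch under the same version assumptions as the snippet (the pattern and frame are hypothetical):

import pandas as pd
import pandera as pa

df = pd.DataFrame({"foo_1": [1], "foo_2": [2], "bar": ["x"]})

# regex=True makes the column name a pattern rather than a literal label.
column_schema = pa.Column(int, name=r"foo_\d+", regex=True)
print(column_schema.get_regex_columns(df.columns).tolist())
# ['foo_1', 'foo_2']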
Source: test_schema_components.py
...
        regex=True,
    )
    if error is not None:
        with pytest.raises(error):
            column_schema.get_regex_columns(columns)
    else:
        matched_columns = column_schema.get_regex_columns(columns)
        assert expected_matches == matched_columns.tolist()


INT_REGEX = r"-?\d+$"
FLOAT_REGEX = r"-?\d+\.\d+$"
DATETIME_REGEX = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"


@pytest.mark.parametrize(
    "column_name_regex, expected_matches",
    [
        # match all
        [".+", [1, 2.2, 3.1415, -1, -3.6, pd.Timestamp("2018/01/01")]],
        # match integers
        [INT_REGEX, [1, -1]],
        # match floats
        [FLOAT_REGEX, [2.2, 3.1415, -3.6]],
        # match datetimes
        [DATETIME_REGEX, [pd.Timestamp("2018/01/01")]],
    ],
)
def test_column_regex_matching_non_str_types(
    column_name_regex: str, expected_matches: List
) -> None:
    """Non-string column names should be cast into str for regex matching."""
    columns = pd.Index([1, 2.2, 3.1415, -1, -3.6, pd.Timestamp("2018/01/01")])
    column_schema = Column(name=column_name_regex, regex=True)
    matched_columns = column_schema.get_regex_columns(columns)
    assert expected_matches == matched_columns.tolist()


@pytest.mark.parametrize(
    "column_name_regex, expected_matches",
    [
        # match all
        [
            (".+", ".+"),
            [
                ("foo", 1),
                ("foo", pd.Timestamp("2018/01/01")),
                (1, 2.2),
                (3.14, -1),
            ],
        ],
        # match (str, int)
        [("foo", INT_REGEX), [("foo", 1)]],
        # match (str, pd.Timestamp)
        [("foo", DATETIME_REGEX), [("foo", pd.Timestamp("2018/01/01"))]],
        # match (int, float)
        [(INT_REGEX, FLOAT_REGEX), [(1, 2.2)]],
        # match (float, int)
        [(FLOAT_REGEX, INT_REGEX), [(3.14, -1)]],
    ],
)
def test_column_regex_matching_non_str_types_multiindex(
    column_name_regex: Tuple[str, str], expected_matches: List[Tuple[Any, Any]]
) -> None:
    """
    Non-string column names should be cast into str for regex matching in
    MultiIndex column case.
    """
    columns = pd.MultiIndex.from_tuples(
        (
            ("foo", 1),
            ("foo", pd.Timestamp("2018/01/01")),
            (1, 2.2),
            (3.14, -1),
        )
    )
    column_schema = Column(name=column_name_regex, regex=True)
    matched_columns = column_schema.get_regex_columns(columns)
    assert expected_matches == matched_columns.tolist()


def test_column_regex_strict() -> None:
    """Test that Column regex patterns are correctly parsed in DataFrameSchema."""
    data = pd.DataFrame(
        {
            "foo_1": [1, 2, 3],
            "foo_2": [1, 2, 3],
            "foo_3": [1, 2, 3],
        }
    )
    schema = DataFrameSchema(
        columns={"foo_*": Column(Int, regex=True)}, strict=True
    )
    assert isinstance(schema.validate(data), pd.DataFrame)
...
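The tests above call get_regex_columns directly; in everyday use the same matching happens inside DataFrameSchema.validate. Below is a hedged sketch combining regex columns, strict mode, and lazy validation, which exercises the SchemaErrorHandler path from schemas.py (the data and pattern are made up for illustration):

import pandas as pd
import pandera as pa

schema = pa.DataFrameSchema(
    columns={r"foo_\d+": pa.Column(int, pa.Check.ge(0), regex=True)},
    strict=True,
)

# "bar" violates strict=True, and foo_1 contains a negative value.
df = pd.DataFrame({"foo_1": [1, -2], "bar": [0, 0]})

try:
    # lazy=True evaluates every check and raises a single SchemaErrors.
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)  # one row per failing check / column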