Best Python code snippet using pandera
schema_components.py
Source: schema_components.py
1"""Components used in pandera schemas."""2import warnings3from copy import copy, deepcopy4from typing import Any, Dict, List, Optional, Tuple, Union5import numpy as np6import pandas as pd7from . import check_utils, errors8from . import strategies as st9from .deprecations import deprecate_pandas_dtype10from .error_handlers import SchemaErrorHandler11from .schemas import (12 CheckList,13 DataFrameSchema,14 PandasDtypeInputTypes,15 SeriesSchemaBase,16)17def _is_valid_multiindex_tuple_str(x: Tuple[Any, ...]) -> bool:18 """Check that a multi-index tuple key has all string elements"""19 return isinstance(x, tuple) and all(isinstance(i, str) for i in x)20class Column(SeriesSchemaBase):21 """Validate types and properties of DataFrame columns."""22 @deprecate_pandas_dtype23 def __init__(24 self,25 dtype: PandasDtypeInputTypes = None,26 checks: CheckList = None,27 nullable: bool = False,28 unique: bool = False,29 allow_duplicates: Optional[bool] = None,30 coerce: bool = False,31 required: bool = True,32 name: Union[str, Tuple[str, ...], None] = None,33 regex: bool = False,34 pandas_dtype: PandasDtypeInputTypes = None,35 title: Optional[str] = None,36 description: Optional[str] = None,37 ) -> None:38 """Create column validator object.39 :param dtype: datatype of the column. A ``PandasDtype`` for40 type-checking dataframe. If a string is specified, then assumes41 one of the valid pandas string values:42 http://pandas.pydata.org/pandas-docs/stable/basics.html#dtypes43 :param checks: checks to verify validity of the column44 :param nullable: Whether or not column can contain null values.45 :param unique: whether column values should be unique46 :param allow_duplicates: Whether or not column can contain duplicate47 values.48 .. warning::49 This option will be deprecated in 0.8.0. Use the ``unique``50 argument instead.51 :param coerce: If True, when schema.validate is called the column will52 be coerced into the specified dtype. This has no effect on columns53 where ``pandas_dtype=None``.54 :param required: Whether or not column is allowed to be missing55 :param name: column name in dataframe to validate.56 :param regex: whether the ``name`` attribute should be treated as a57 regex pattern to apply to multiple columns in a dataframe.58 :param pandas_dtype: alias of ``dtype`` for backwards compatibility.59 .. warning:: This option will be deprecated in 0.8.060 :param title: A human-readable label for the column.61 :param description: An arbitrary textual description of the column.62 :raises SchemaInitError: if impossible to build schema from parameters63 :example:64 >>> import pandas as pd65 >>> import pandera as pa66 >>>67 >>>68 >>> schema = pa.DataFrameSchema({69 ... "column": pa.Column(str)70 ... 
})71 >>>72 >>> schema.validate(pd.DataFrame({"column": ["foo", "bar"]}))73 column74 0 foo75 1 bar76 See :ref:`here<column>` for more usage details.77 """78 super().__init__(79 dtype,80 checks,81 nullable,82 unique,83 allow_duplicates,84 coerce,85 name,86 pandas_dtype,87 title,88 description,89 )90 if (91 name is not None92 and not isinstance(name, str)93 and not _is_valid_multiindex_tuple_str(name)94 and regex95 ):96 raise ValueError(97 "You cannot specify a non-string name when setting regex=True"98 )99 self.required = required100 self._name = name101 self._regex = regex102 @property103 def regex(self) -> bool:104 """True if ``name`` attribute should be treated as a regex pattern."""105 return self._regex106 @property107 def _allow_groupby(self) -> bool:108 """Whether the schema or schema component allows groupby operations."""109 return True110 @property111 def properties(self) -> Dict[str, Any]:112 """Get column properties."""113 return {114 "dtype": self.dtype,115 "checks": self._checks,116 "nullable": self._nullable,117 "unique": self._unique,118 "coerce": self._coerce,119 "required": self.required,120 "name": self._name,121 "regex": self._regex,122 "title": self.title,123 "description": self.description,124 }125 def set_name(self, name: str):126 """Used to set or modify the name of a column object.127 :param str name: the name of the column object128 """129 self._name = name130 return self131 def coerce_dtype(self, obj: Union[pd.DataFrame, pd.Series, pd.Index]):132 """Coerce dtype of a column, handling duplicate column names."""133 # pylint: disable=super-with-arguments134 if check_utils.is_field(obj) or check_utils.is_index(obj):135 return super(Column, self).coerce_dtype(obj)136 return obj.apply(137 lambda x: super(Column, self).coerce_dtype(x), axis="columns"138 )139 def validate(140 self,141 check_obj: pd.DataFrame,142 head: Optional[int] = None,143 tail: Optional[int] = None,144 sample: Optional[int] = None,145 random_state: Optional[int] = None,146 lazy: bool = False,147 inplace: bool = False,148 ) -> pd.DataFrame:149 """Validate a Column in a DataFrame object.150 :param check_obj: pandas DataFrame to validate.151 :param head: validate the first n rows. Rows overlapping with `tail` or152 `sample` are de-duplicated.153 :param tail: validate the last n rows. Rows overlapping with `head` or154 `sample` are de-duplicated.155 :param sample: validate a random sample of n rows. Rows overlapping156 with `head` or `tail` are de-duplicated.157 :param random_state: random seed for the ``sample`` argument.158 :param lazy: if True, lazily evaluates dataframe against all validation159 checks and raises a ``SchemaErrors``. Otherwise, raise160 ``SchemaError`` as soon as one occurs.161 :param inplace: if True, applies coercion to the object of validation,162 otherwise creates a copy of the data.163 :returns: validated DataFrame.164 """165 if not inplace:166 check_obj = check_obj.copy()167 if self._name is None:168 raise errors.SchemaError(169 self,170 check_obj,171 "column name is set to None. 
Pass the ``name` argument when "172 "initializing a Column object, or use the ``set_name`` "173 "method.",174 )175 def validate_column(check_obj, column_name):176 super(Column, copy(self).set_name(column_name)).validate(177 check_obj,178 head,179 tail,180 sample,181 random_state,182 lazy,183 inplace=inplace,184 )185 column_keys_to_check = (186 self.get_regex_columns(check_obj.columns)187 if self._regex188 else [self._name]189 )190 for column_name in column_keys_to_check:191 if self.coerce:192 check_obj[column_name] = self.coerce_dtype(193 check_obj[column_name]194 )195 if check_utils.is_table(check_obj[column_name]):196 for i in range(check_obj[column_name].shape[1]):197 validate_column(198 check_obj[column_name].iloc[:, [i]], column_name199 )200 else:201 validate_column(check_obj, column_name)202 return check_obj203 def get_regex_columns(204 self, columns: Union[pd.Index, pd.MultiIndex]205 ) -> Union[pd.Index, pd.MultiIndex]:206 """Get matching column names based on regex column name pattern.207 :param columns: columns to regex pattern match208 :returns: matchin columns209 """210 if isinstance(self.name, tuple):211 # handle MultiIndex case212 if len(self.name) != columns.nlevels:213 raise IndexError(214 f"Column regex name='{self.name}' is a tuple, expected a "215 f"MultiIndex columns with {len(self.name)} number of "216 f"levels, found {columns.nlevels} level(s)"217 )218 matches = np.ones(len(columns)).astype(bool)219 for i, name in enumerate(self.name):220 matched = pd.Index(221 columns.get_level_values(i).astype(str).str.match(name)222 ).fillna(False)223 matches = matches & np.array(matched.tolist())224 column_keys_to_check = columns[matches]225 else:226 if check_utils.is_multiindex(columns):227 raise IndexError(228 f"Column regex name {self.name} is a string, expected a "229 "dataframe where the index is a pd.Index object, not a "230 "pd.MultiIndex object"231 )232 column_keys_to_check = columns[233 # str.match will return nan values when the index value is234 # not a string.235 pd.Index(columns.astype(str).str.match(self.name))236 .fillna(False)237 .tolist()238 ]239 if column_keys_to_check.shape[0] == 0:240 raise errors.SchemaError(241 self,242 columns,243 f"Column regex name='{self.name}' did not match any columns "244 "in the dataframe. 
Update the regex pattern so that it "245 f"matches at least one column:\n{columns.tolist()}",246 )247 # drop duplicates to account for potential duplicated columns in the248 # dataframe.249 return column_keys_to_check.drop_duplicates()250 @st.strategy_import_error251 def strategy(self, *, size=None):252 """Create a ``hypothesis`` strategy for generating a Column.253 :param size: number of elements to generate254 :returns: a dataframe strategy for a single column.255 """256 return super().strategy(size=size).map(lambda x: x.to_frame())257 @st.strategy_import_error258 def strategy_component(self):259 """Generate column data object for use by DataFrame strategy."""260 return st.column_strategy(261 self.dtype,262 checks=self.checks,263 unique=self.unique,264 name=self.name,265 )266 def example(self, size=None) -> pd.DataFrame:267 """Generate an example of a particular size.268 :param size: number of elements in the generated Index.269 :returns: pandas DataFrame object.270 """271 # pylint: disable=import-outside-toplevel,cyclic-import,import-error272 import hypothesis273 with warnings.catch_warnings():274 warnings.simplefilter(275 "ignore",276 category=hypothesis.errors.NonInteractiveExampleWarning,277 )278 return (279 super()280 .strategy(size=size)281 .example()282 .rename(self.name)283 .to_frame()284 )285 def __eq__(self, other):286 if not isinstance(other, self.__class__):287 return NotImplemented288 def _compare_dict(obj):289 return {290 k: v if k != "_checks" else set(v)291 for k, v in obj.__dict__.items()292 }293 return _compare_dict(self) == _compare_dict(other)294class Index(SeriesSchemaBase):295 """Validate types and properties of a DataFrame Index."""296 @property297 def names(self):298 """Get index names in the Index schema component."""299 return [self.name]300 @property301 def _allow_groupby(self) -> bool:302 """Whether the schema or schema component allows groupby operations."""303 return False304 def validate(305 self,306 check_obj: Union[pd.DataFrame, pd.Series],307 head: Optional[int] = None,308 tail: Optional[int] = None,309 sample: Optional[int] = None,310 random_state: Optional[int] = None,311 lazy: bool = False,312 inplace: bool = False,313 ) -> Union[pd.DataFrame, pd.Series]:314 """Validate DataFrameSchema or SeriesSchema Index.315 :check_obj: pandas DataFrame of Series containing index to validate.316 :param head: validate the first n rows. Rows overlapping with `tail` or317 `sample` are de-duplicated.318 :param tail: validate the last n rows. Rows overlapping with `head` or319 `sample` are de-duplicated.320 :param sample: validate a random sample of n rows. Rows overlapping321 with `head` or `tail` are de-duplicated.322 :param random_state: random seed for the ``sample`` argument.323 :param lazy: if True, lazily evaluates dataframe against all validation324 checks and raises a ``SchemaErrors``. 
Otherwise, raise325 ``SchemaError`` as soon as one occurs.326 :param inplace: if True, applies coercion to the object of validation,327 otherwise creates a copy of the data.328 :returns: validated DataFrame or Series.329 """330 if check_utils.is_multiindex(check_obj.index):331 raise errors.SchemaError(332 self, check_obj, "Attempting to validate mismatch index"333 )334 series_cls = pd.Series335 # NOTE: this is a hack to get koalas working, this needs a more336 # principled implementation337 if type(check_obj).__module__ == "databricks.koalas.frame":338 # pylint: disable=import-outside-toplevel339 import databricks.koalas as ks340 series_cls = ks.Series341 if self.coerce:342 check_obj.index = self.coerce_dtype(check_obj.index)343 # handles case where pandas native string type is not supported344 # by index.345 obj_to_validate = self.dtype.coerce(346 series_cls(347 check_obj.index.to_numpy(), name=check_obj.index.name348 )349 )350 else:351 obj_to_validate = series_cls(352 check_obj.index.to_numpy(), name=check_obj.index.name353 )354 assert check_utils.is_field(355 super().validate(356 obj_to_validate,357 head,358 tail,359 sample,360 random_state,361 lazy,362 inplace,363 ),364 )365 return check_obj366 @st.strategy_import_error367 def strategy(self, *, size: int = None):368 """Create a ``hypothesis`` strategy for generating an Index.369 :param size: number of elements to generate.370 :returns: index strategy.371 """372 return st.index_strategy(373 self.dtype, # type: ignore374 checks=self.checks,375 nullable=self.nullable,376 unique=self.unique,377 name=self.name,378 size=size,379 )380 @st.strategy_import_error381 def strategy_component(self):382 """Generate column data object for use by MultiIndex strategy."""383 return st.column_strategy(384 self.dtype,385 checks=self.checks,386 unique=self.unique,387 name=self.name,388 )389 def example(self, size: int = None) -> pd.Index:390 """Generate an example of a particular size.391 :param size: number of elements in the generated Index.392 :returns: pandas Index object.393 """394 # pylint: disable=import-outside-toplevel,cyclic-import,import-error395 import hypothesis396 with warnings.catch_warnings():397 warnings.simplefilter(398 "ignore",399 category=hypothesis.errors.NonInteractiveExampleWarning,400 )401 return self.strategy(size=size).example()402 def __eq__(self, other):403 return self.__dict__ == other.__dict__404class MultiIndex(DataFrameSchema):405 """Validate types and properties of a DataFrame MultiIndex.406 This class inherits from :class:`~pandera.schemas.DataFrameSchema` to407 leverage its validation logic.408 """409 def __init__(410 self,411 indexes: List[Index],412 coerce: bool = False,413 strict: bool = False,414 name: str = None,415 ordered: bool = True,416 unique: Optional[Union[str, List[str]]] = None,417 ) -> None:418 """Create MultiIndex validator.419 :param indexes: list of Index validators for each level of the420 MultiIndex index.421 :param coerce: Whether or not to coerce the MultiIndex to the422 specified dtypes before validation423 :param strict: whether or not to accept columns in the MultiIndex that424 aren't defined in the ``indexes`` argument.425 :param name: name of schema component426 :param ordered: whether or not to validate the indexes order.427 :param unique: a list of index names that should be jointly unique.428 :example:429 >>> import pandas as pd430 >>> import pandera as pa431 >>>432 >>>433 >>> schema = pa.DataFrameSchema(434 ... columns={"column": pa.Column(int)},435 ... index=pa.MultiIndex([436 ... 
pa.Index(str,437 ... pa.Check(lambda s: s.isin(["foo", "bar"])),438 ... name="index0"),439 ... pa.Index(int, name="index1"),440 ... ])441 ... )442 >>>443 >>> df = pd.DataFrame(444 ... data={"column": [1, 2, 3]},445 ... index=pd.MultiIndex.from_arrays(446 ... [["foo", "bar", "foo"], [0, 1, 2]],447 ... names=["index0", "index1"],448 ... )449 ... )450 >>>451 >>> schema.validate(df)452 column453 index0 index1454 foo 0 1455 bar 1 2456 foo 2 3457 See :ref:`here<multiindex>` for more usage details.458 """459 if any(not isinstance(i, Index) for i in indexes):460 raise errors.SchemaInitError(461 f"expected a list of Index objects, found {indexes} "462 f"of type {[type(x) for x in indexes]}"463 )464 self.indexes = indexes465 columns = {}466 for i, index in enumerate(indexes):467 if not ordered and index.name is None:468 # if the MultiIndex is not ordered, there's no way of469 # determining how to get the index level without an explicit470 # index name471 raise errors.SchemaInitError(472 "You must specify index names if MultiIndex schema "473 "component is not ordered."474 )475 columns[i if index.name is None else index.name] = Column(476 dtype=index._dtype,477 checks=index.checks,478 nullable=index._nullable,479 unique=index._unique,480 )481 super().__init__(482 columns=columns,483 coerce=coerce,484 strict=strict,485 name=name,486 ordered=ordered,487 unique=unique,488 )489 @property490 def names(self):491 """Get index names in the MultiIndex schema component."""492 return [index.name for index in self.indexes]493 @property494 def coerce(self):495 """Whether or not to coerce data types."""496 return self._coerce or any(index.coerce for index in self.indexes)497 @coerce.setter498 def coerce(self, value: bool) -> None:499 """Set coerce attribute."""500 self._coerce = value501 def coerce_dtype(self, obj: pd.MultiIndex) -> pd.MultiIndex:502 """Coerce type of a pd.Series by type specified in dtype.503 :param obj: multi-index to coerce.504 :returns: ``MultiIndex`` with coerced data type505 """506 error_handler = SchemaErrorHandler(lazy=True)507 # construct MultiIndex with coerced data types508 coerced_multi_index = {}509 for i, index in enumerate(self.indexes):510 if all(x is None for x in self.names):511 index_levels = [i]512 else:513 index_levels = [514 i for i, name in enumerate(obj.names) if name == index.name515 ]516 for index_level in index_levels:517 index_array = obj.get_level_values(index_level)518 if index.coerce or self._coerce:519 try:520 index_array = index.coerce_dtype(index_array)521 except errors.SchemaError as err:522 error_handler.collect_error(523 "dtype_coercion_error", err524 )525 coerced_multi_index[index_level] = index_array526 if error_handler.collected_errors:527 raise errors.SchemaErrors(error_handler.collected_errors, obj)528 multiindex_cls = pd.MultiIndex529 # NOTE: this is a hack to support koalas530 if type(obj).__module__.startswith("databricks.koalas"):531 # pylint: disable=import-outside-toplevel532 import databricks.koalas as ks533 multiindex_cls = ks.MultiIndex534 return multiindex_cls.from_arrays(535 [536 v.to_numpy()537 for k, v in sorted(538 coerced_multi_index.items(), key=lambda x: x[0]539 )540 ],541 names=obj.names,542 )543 def validate(544 self,545 check_obj: Union[pd.DataFrame, pd.Series],546 head: Optional[int] = None,547 tail: Optional[int] = None,548 sample: Optional[int] = None,549 random_state: Optional[int] = None,550 lazy: bool = False,551 inplace: bool = False,552 ) -> Union[pd.DataFrame, pd.Series]:553 """Validate DataFrame or Series MultiIndex.554 :param 
check_obj: pandas DataFrame of Series to validate.555 :param head: validate the first n rows. Rows overlapping with `tail` or556 `sample` are de-duplicated.557 :param tail: validate the last n rows. Rows overlapping with `head` or558 `sample` are de-duplicated.559 :param sample: validate a random sample of n rows. Rows overlapping560 with `head` or `tail` are de-duplicated.561 :param random_state: random seed for the ``sample`` argument.562 :param lazy: if True, lazily evaluates dataframe against all validation563 checks and raises a ``SchemaErrors``. Otherwise, raise564 ``SchemaError`` as soon as one occurs.565 :param inplace: if True, applies coercion to the object of validation,566 otherwise creates a copy of the data.567 :returns: validated DataFrame or Series.568 """569 # pylint: disable=too-many-locals570 if self.coerce:571 try:572 check_obj.index = self.coerce_dtype(check_obj.index)573 except errors.SchemaErrors as err:574 if lazy:575 raise576 raise err.schema_errors[0]["error"] from err577 # Prevent data type coercion when the validate method is called because578 # it leads to some weird behavior when calling coerce_dtype within the579 # DataFrameSchema.validate call. Need to fix this by having MultiIndex580 # not inherit from DataFrameSchema.581 self_copy = deepcopy(self)582 self_copy.coerce = False583 for index in self_copy.indexes:584 index.coerce = False585 # rename integer-based column names in case of duplicate index names,586 # with at least one named index.587 if (588 not all(x is None for x in check_obj.index.names)589 and len(set(check_obj.index.names)) != check_obj.index.nlevels590 ):591 index_names = []592 for i, name in enumerate(check_obj.index.names):593 name = i if name is None else name594 if name not in index_names:595 index_names.append(name)596 columns = {}597 for name, (_, column) in zip(598 index_names, self_copy.columns.items()599 ):600 columns[name] = column.set_name(name)601 self_copy.columns = columns602 def to_dataframe(multiindex):603 """604 Emulate the behavior of pandas.MultiIndex.to_frame, but preserve605 duplicate index names if they exist.606 """607 # NOTE: this is a hack to support koalas608 if type(multiindex).__module__.startswith("databricks.koalas"):609 df = multiindex.to_frame()610 else:611 df = pd.DataFrame(612 {613 i: multiindex.get_level_values(i)614 for i in range(multiindex.nlevels)615 }616 )617 df.columns = [618 i if name is None else name619 for i, name in enumerate(multiindex.names)620 ]621 df.index = multiindex622 return df623 try:624 validation_result = super(MultiIndex, self_copy).validate(625 to_dataframe(check_obj.index),626 head,627 tail,628 sample,629 random_state,630 lazy,631 inplace,632 )633 except errors.SchemaErrors as err:634 # This is a hack to re-raise the SchemaErrors exception and change635 # the schema context to MultiIndex. 
This should be fixed by with636 # a more principled schema class hierarchy.637 schema_error_dicts = []638 for schema_error_dict in err.schema_errors:639 error = schema_error_dict["error"]640 error = errors.SchemaError(641 self,642 check_obj,643 error.args[0],644 error.failure_cases.assign(column=error.schema.name),645 error.check,646 error.check_index,647 )648 schema_error_dict["error"] = error649 schema_error_dicts.append(schema_error_dict)650 raise errors.SchemaErrors(schema_error_dicts, check_obj)651 assert check_utils.is_table(validation_result)652 return check_obj653 @st.strategy_import_error654 # NOTE: remove these ignore statements as part of655 # https://github.com/pandera-dev/pandera/issues/403656 # pylint: disable=arguments-differ657 def strategy(self, *, size=None): # type: ignore658 return st.multiindex_strategy(indexes=self.indexes, size=size)659 # NOTE: remove these ignore statements as part of660 # https://github.com/pandera-dev/pandera/issues/403661 # pylint: disable=arguments-differ662 def example(self, size=None) -> pd.MultiIndex: # type: ignore663 # pylint: disable=import-outside-toplevel,cyclic-import,import-error664 import hypothesis665 with warnings.catch_warnings():666 warnings.simplefilter(667 "ignore",668 category=hypothesis.errors.NonInteractiveExampleWarning,669 )670 return self.strategy(size=size).example()671 def __repr__(self):672 return (673 f"<Schema {self.__class__.__name__}("674 f"indexes={self.indexes}, "675 f"coerce={self.coerce}, "676 f"strict={self.strict}, "677 f"name={self.name}, "678 f"ordered={self.ordered}"679 ")>"680 )681 def __str__(self):682 indent = " " * 4683 indexes_str = "[\n"684 for index in self.indexes:685 indexes_str += f"{indent * 2}{index}\n"686 indexes_str += f"{indent}]"687 return (688 f"<Schema {self.__class__.__name__}(\n"689 f"{indent}indexes={indexes_str}\n"690 f"{indent}coerce={self.coerce},\n"691 f"{indent}strict={self.strict},\n"692 f"{indent}name={self.name},\n"693 f"{indent}ordered={self.ordered}\n"694 ")>"695 )696 def __eq__(self, other):...
model.py
Source: model.py
1"""Class-based api"""2import inspect3import os4import re5import sys6import typing7from typing import (8 Any,9 Callable,10 Dict,11 Iterable,12 List,13 Optional,14 Set,15 Tuple,16 Type,17 TypeVar,18 Union,19 cast,20)21import pandas as pd22from . import schema_components23from . import strategies as st24from .checks import Check25from .errors import SchemaInitError26from .json_schema import to_json_schema27from .model_components import (28 CHECK_KEY,29 DATAFRAME_CHECK_KEY,30 CheckInfo,31 Field,32 FieldCheckInfo,33 FieldInfo,34)35from .schemas import DataFrameSchema36from .typing import INDEX_TYPES, SERIES_TYPES, AnnotationInfo37from .typing.common import DataFrameBase38from .typing.config import BaseConfig39if sys.version_info[:2] < (3, 9):40 from typing_extensions import get_type_hints41else:42 from typing import get_type_hints43try:44 from pydantic.fields import ModelField # pylint:disable=unused-import45 HAS_PYDANTIC = True46except ImportError:47 HAS_PYDANTIC = False48SchemaIndex = Union[schema_components.Index, schema_components.MultiIndex]49_CONFIG_KEY = "Config"50MODEL_CACHE: Dict[Type["SchemaModel"], DataFrameSchema] = {}51F = TypeVar("F", bound=Callable)52TSchemaModel = TypeVar("TSchemaModel", bound="SchemaModel")53def docstring_substitution(*args: Any, **kwargs: Any) -> Callable[[F], F]:54 """Typed wrapper around pd.util.Substitution."""55 def decorator(func: F) -> F:56 return cast(F, pd.util.Substitution(*args, **kwargs)(func))57 return decorator58def _is_field(name: str) -> bool:59 """Ignore private and reserved keywords."""60 return not name.startswith("_") and name != _CONFIG_KEY61_config_options = [attr for attr in vars(BaseConfig) if _is_field(attr)]62def _extract_config_options_and_extras(63 config: Any,64) -> Tuple[Dict[str, Any], Dict[str, Any]]:65 config_options, extras = {}, {}66 for name, value in vars(config).items():67 if name in _config_options:68 config_options[name] = value69 elif _is_field(name):70 extras[name] = value71 # drop private/reserved keywords72 return config_options, extras73def _convert_extras_to_checks(extras: Dict[str, Any]) -> List[Check]:74 """75 New in GH#383.76 Any key not in BaseConfig keys is interpreted as defining a dataframe check. 
This function77 defines this conversion as follows:78 - Look up the key name in Check79 - If value is80 - tuple: interpret as args81 - dict: interpret as kwargs82 - anything else: interpret as the only argument to pass to Check83 """84 checks = []85 for name, value in extras.items():86 if isinstance(value, tuple):87 args, kwargs = value, {}88 elif isinstance(value, dict):89 args, kwargs = (), value90 else:91 args, kwargs = (value,), {}92 # dispatch directly to getattr to raise the correct exception93 checks.append(Check.__getattr__(name)(*args, **kwargs))94 return checks95class _MetaSchema(type):96 """Add string representations, mainly for pydantic."""97 def __repr__(cls):98 return str(cls)99 def __str__(cls):100 return cls.__name__101class SchemaModel(metaclass=_MetaSchema):102 """Definition of a :class:`~pandera.DataFrameSchema`.103 *new in 0.5.0*104 See the :ref:`User Guide <schema_models>` for more.105 """106 Config: Type[BaseConfig] = BaseConfig107 __extras__: Optional[Dict[str, Any]] = None108 __schema__: Optional[DataFrameSchema] = None109 __config__: Optional[Type[BaseConfig]] = None110 #: Key according to `FieldInfo.name`111 __fields__: Dict[str, Tuple[AnnotationInfo, FieldInfo]] = {}112 __checks__: Dict[str, List[Check]] = {}113 __dataframe_checks__: List[Check] = []114 # This is syntantic sugar that delegates to the validate method115 @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)116 def __new__(cls, *args, **kwargs) -> DataFrameBase[TSchemaModel]: # type: ignore [misc]117 """%(validate_doc)s"""118 return cast(DataFrameBase[TSchemaModel], cls.validate(*args, **kwargs))119 def __init_subclass__(cls, **kwargs):120 """Ensure :class:`~pandera.model_components.FieldInfo` instances."""121 super().__init_subclass__(**kwargs)122 # pylint:disable=no-member123 subclass_annotations = cls.__dict__.get("__annotations__", {})124 for field_name in subclass_annotations.keys():125 if _is_field(field_name) and field_name not in cls.__dict__:126 # Field omitted127 field = Field()128 field.__set_name__(cls, field_name)129 setattr(cls, field_name, field)130 cls.__config__, cls.__extras__ = cls._collect_config_and_extras()131 @classmethod132 def to_schema(cls) -> DataFrameSchema:133 """Create :class:`~pandera.DataFrameSchema` from the :class:`.SchemaModel`."""134 if cls in MODEL_CACHE:135 return MODEL_CACHE[cls]136 mi_kwargs = {137 name[len("multiindex_") :]: value138 for name, value in vars(cls.__config__).items()139 if name.startswith("multiindex_")140 }141 cls.__fields__ = cls._collect_fields()142 check_infos = typing.cast(143 List[FieldCheckInfo], cls._collect_check_infos(CHECK_KEY)144 )145 cls.__checks__ = cls._extract_checks(146 check_infos, field_names=list(cls.__fields__.keys())147 )148 df_check_infos = cls._collect_check_infos(DATAFRAME_CHECK_KEY)149 df_custom_checks = cls._extract_df_checks(df_check_infos)150 df_registered_checks = _convert_extras_to_checks(151 {} if cls.__extras__ is None else cls.__extras__152 )153 cls.__dataframe_checks__ = df_custom_checks + df_registered_checks154 columns, index = cls._build_columns_index(155 cls.__fields__, cls.__checks__, **mi_kwargs156 )157 kwargs = {}158 if cls.__config__ is not None:159 kwargs = {160 "coerce": cls.__config__.coerce,161 "strict": cls.__config__.strict,162 "name": cls.__config__.name,163 "ordered": cls.__config__.ordered,164 "unique": cls.__config__.unique,165 "title": cls.__config__.title,166 "description": cls.__config__.description or cls.__doc__,167 }168 cls.__schema__ = DataFrameSchema(169 columns,170 
index=index,171 checks=cls.__dataframe_checks__, # type: ignore172 **kwargs,173 )174 if cls not in MODEL_CACHE:175 MODEL_CACHE[cls] = cls.__schema__ # type: ignore176 return cls.__schema__ # type: ignore177 @classmethod178 def to_yaml(cls, stream: Optional[os.PathLike] = None):179 """180 Convert `Schema` to yaml using `io.to_yaml`.181 """182 return cls.to_schema().to_yaml(stream)183 @classmethod184 @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)185 def validate(186 cls: Type[TSchemaModel],187 check_obj: pd.DataFrame,188 head: Optional[int] = None,189 tail: Optional[int] = None,190 sample: Optional[int] = None,191 random_state: Optional[int] = None,192 lazy: bool = False,193 inplace: bool = False,194 ) -> DataFrameBase[TSchemaModel]:195 """%(validate_doc)s"""196 return cast(197 DataFrameBase[TSchemaModel],198 cls.to_schema().validate(199 check_obj, head, tail, sample, random_state, lazy, inplace200 ),201 )202 @classmethod203 @docstring_substitution(strategy_doc=DataFrameSchema.strategy.__doc__)204 @st.strategy_import_error205 def strategy(cls: Type[TSchemaModel], *, size: Optional[int] = None):206 """%(strategy_doc)s"""207 return cls.to_schema().strategy(size=size)208 @classmethod209 @docstring_substitution(example_doc=DataFrameSchema.strategy.__doc__)210 @st.strategy_import_error211 def example(212 cls: Type[TSchemaModel], *, size: Optional[int] = None213 ) -> DataFrameBase[TSchemaModel]:214 """%(example_doc)s"""215 return cast(216 DataFrameBase[TSchemaModel], cls.to_schema().example(size=size)217 )218 @classmethod219 def _build_columns_index( # pylint:disable=too-many-locals220 cls,221 fields: Dict[str, Tuple[AnnotationInfo, FieldInfo]],222 checks: Dict[str, List[Check]],223 **multiindex_kwargs: Any,224 ) -> Tuple[225 Dict[str, schema_components.Column],226 Optional[Union[schema_components.Index, schema_components.MultiIndex]],227 ]:228 index_count = sum(229 annotation.origin in INDEX_TYPES230 for annotation, _ in fields.values()231 )232 columns: Dict[str, schema_components.Column] = {}233 indices: List[schema_components.Index] = []234 for field_name, (annotation, field) in fields.items():235 field_checks = checks.get(field_name, [])236 field_name = field.name237 check_name = getattr(field, "check_name", None)238 if annotation.metadata:239 if field.dtype_kwargs:240 raise TypeError(241 "Cannot specify redundant 'dtype_kwargs' "242 + f"for {annotation.raw_annotation}."243 + "\n Usage Tip: Drop 'typing.Annotated'."244 )245 dtype_kwargs = _get_dtype_kwargs(annotation)246 dtype = annotation.arg(**dtype_kwargs) # type: ignore247 elif annotation.default_dtype:248 dtype = annotation.default_dtype249 else:250 dtype = annotation.arg251 dtype = None if dtype is Any else dtype252 if (253 annotation.origin in SERIES_TYPES254 or annotation.raw_annotation in SERIES_TYPES255 ):256 col_constructor = (257 field.to_column if field else schema_components.Column258 )259 if check_name is False:260 raise SchemaInitError(261 f"'check_name' is not supported for {field_name}."262 )263 columns[field_name] = col_constructor( # type: ignore264 dtype,265 required=not annotation.optional,266 checks=field_checks,267 name=field_name,268 )269 elif (270 annotation.origin in INDEX_TYPES271 or annotation.raw_annotation in INDEX_TYPES272 ):273 if annotation.optional:274 raise SchemaInitError(275 f"Index '{field_name}' cannot be Optional."276 )277 if check_name is False or (278 # default single index279 check_name is None280 and index_count == 1281 ):282 field_name = None # type:ignore283 
index_constructor = (284 field.to_index if field else schema_components.Index285 )286 index = index_constructor( # type: ignore287 dtype, checks=field_checks, name=field_name288 )289 indices.append(index)290 else:291 raise SchemaInitError(292 f"Invalid annotation '{field_name}: "293 f"{annotation.raw_annotation}'"294 )295 return columns, _build_schema_index(indices, **multiindex_kwargs)296 @classmethod297 def _get_model_attrs(cls) -> Dict[str, Any]:298 """Return all attributes.299 Similar to inspect.get_members but bypass descriptors __get__.300 """301 bases = inspect.getmro(cls)[:-1] # bases -> SchemaModel -> object302 attrs = {}303 for base in reversed(bases):304 attrs.update(base.__dict__)305 return attrs306 @classmethod307 def _collect_fields(cls) -> Dict[str, Tuple[AnnotationInfo, FieldInfo]]:308 """Centralize publicly named fields and their corresponding annotations."""309 annotations = get_type_hints( # pylint:disable=unexpected-keyword-arg310 cls, include_extras=True311 )312 attrs = cls._get_model_attrs()313 missing = []314 for name, attr in attrs.items():315 if inspect.isroutine(attr):316 continue317 if not _is_field(name):318 annotations.pop(name, None)319 elif name not in annotations:320 missing.append(name)321 if missing:322 raise SchemaInitError(f"Found missing annotations: {missing}")323 fields = {}324 for field_name, annotation in annotations.items():325 field = attrs[field_name] # __init_subclass__ guarantees existence326 if not isinstance(field, FieldInfo):327 raise SchemaInitError(328 f"'{field_name}' can only be assigned a 'Field', "329 + f"not a '{type(field)}.'"330 )331 fields[field.name] = (AnnotationInfo(annotation), field)332 return fields333 @classmethod334 def _collect_config_and_extras(335 cls,336 ) -> Tuple[Type[BaseConfig], Dict[str, Any]]:337 """Collect config options from bases, splitting off unknown options."""338 bases = inspect.getmro(cls)[:-1]339 bases = typing.cast(Tuple[Type[SchemaModel]], bases)340 root_model, *models = reversed(bases)341 options, extras = _extract_config_options_and_extras(root_model.Config)342 for model in models:343 config = getattr(model, _CONFIG_KEY, {})344 base_options, base_extras = _extract_config_options_and_extras(345 config346 )347 options.update(base_options)348 extras.update(base_extras)349 return type("Config", (BaseConfig,), options), extras350 @classmethod351 def _collect_check_infos(cls, key: str) -> List[CheckInfo]:352 """Collect inherited check metadata from bases.353 Inherited classmethods are not in cls.__dict__, that's why we need to354 walk the inheritance tree.355 """356 bases = inspect.getmro(cls)[:-2] # bases -> SchemaModel -> object357 bases = typing.cast(Tuple[Type[SchemaModel]], bases)358 method_names = set()359 check_infos = []360 for base in bases:361 for attr_name, attr_value in vars(base).items():362 check_info = getattr(attr_value, key, None)363 if not isinstance(check_info, CheckInfo):364 continue365 if attr_name in method_names: # check overridden by subclass366 continue367 method_names.add(attr_name)368 check_infos.append(check_info)369 return check_infos370 @classmethod371 def _extract_checks(372 cls, check_infos: List[FieldCheckInfo], field_names: List[str]373 ) -> Dict[str, List[Check]]:374 """Collect field annotations from bases in mro reverse order."""375 checks: Dict[str, List[Check]] = {}376 for check_info in check_infos:377 check_info_fields = {378 field.name if isinstance(field, FieldInfo) else field379 for field in check_info.fields380 }381 if check_info.regex:382 matched = 
_regex_filter(field_names, check_info_fields)383 else:384 matched = check_info_fields385 check_ = check_info.to_check(cls)386 for field in matched:387 if field not in field_names:388 raise SchemaInitError(389 f"Check {check_.name} is assigned to a non-existing field '{field}'."390 )391 if field not in checks:392 checks[field] = []393 checks[field].append(check_)394 return checks395 @classmethod396 def _extract_df_checks(cls, check_infos: List[CheckInfo]) -> List[Check]:397 """Collect field annotations from bases in mro reverse order."""398 return [check_info.to_check(cls) for check_info in check_infos]399 @classmethod400 def __get_validators__(cls):401 yield cls._pydantic_validate402 @classmethod403 def _pydantic_validate(cls, schema_model: Any) -> "SchemaModel":404 """Verify that the input is a compatible schema model."""405 if not inspect.isclass(schema_model): # type: ignore406 raise TypeError(f"{schema_model} is not a pandera.SchemaModel")407 if not issubclass(schema_model, cls): # type: ignore408 raise TypeError(f"{schema_model} does not inherit {cls}.")409 try:410 schema_model.to_schema()411 except SchemaInitError as exc:412 raise ValueError(413 f"Cannot use {cls} as a pydantic type as its "414 "SchemaModel cannot be converted to a DataFrameSchema.\n"415 f"Please revisit the model to address the following errors:"416 f"\n{exc}"417 ) from exc418 return cast("SchemaModel", schema_model)419 @classmethod420 def __modify_schema__(cls, field_schema):421 """Update pydantic field schema."""422 field_schema.update(to_json_schema(cls.to_schema()))423def _build_schema_index(424 indices: List[schema_components.Index], **multiindex_kwargs: Any425) -> Optional[SchemaIndex]:426 index: Optional[SchemaIndex] = None427 if indices:428 if len(indices) == 1:429 index = indices[0]430 else:431 index = schema_components.MultiIndex(indices, **multiindex_kwargs)432 return index433def _regex_filter(seq: Iterable, regexps: Iterable[str]) -> Set[str]:434 """Filter items matching at least one of the regexes."""435 matched: Set[str] = set()436 for regex in regexps:437 pattern = re.compile(regex)438 matched.update(filter(pattern.match, seq))439 return matched440def _get_dtype_kwargs(annotation: AnnotationInfo) -> Dict[str, Any]:441 sig = inspect.signature(annotation.arg) # type: ignore442 dtype_arg_names = list(sig.parameters.keys())443 if len(annotation.metadata) != len(dtype_arg_names): # type: ignore444 raise TypeError(445 f"Annotation '{annotation.arg.__name__}' requires " # type: ignore446 + f"all positional arguments {dtype_arg_names}."447 )...
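A minimal sketch of the class-based API that model.py implements (the ``OrderSchema`` model and its field names are illustrative, assuming pandera >= 0.5 where ``SchemaModel`` was introduced): each annotated class attribute is collected by ``_collect_fields`` and turned into a ``Column`` or ``Index`` by ``_build_columns_index``, while ``Config`` options are forwarded to the ``DataFrameSchema`` constructor inside ``to_schema``.

import pandas as pd
import pandera as pa
from pandera.typing import Index, Series

class OrderSchema(pa.SchemaModel):
    # Annotated attributes become Columns; Field kwargs become Checks.
    item: Series[str] = pa.Field(isin=["apple", "orange"])
    quantity: Series[int] = pa.Field(ge=0)
    # A single Index annotation becomes the schema's index.
    idx: Index[int] = pa.Field(check_name=False)

    class Config:
        # Options defined on BaseConfig; unknown keys would be converted
        # into registered dataframe checks by _convert_extras_to_checks.
        coerce = True
        strict = True

df = pd.DataFrame({"item": ["apple", "orange"], "quantity": [1, 2]})
validated = OrderSchema.validate(df)  # delegates to to_schema().validate(...)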