Best Python code snippet using pandera_python
schemas.py
Source: schemas.py
...
                failure_cases=exc.failure_cases,
                check=f"coerce_dtype('{self.dtype}')",
            ) from exc

    @property
    def _allow_groupby(self):
        """Whether the schema or schema component allows groupby operations."""
        raise NotImplementedError(  # pragma: no cover
            "The _allow_groupby property must be implemented by subclasses "
            "of SeriesSchemaBase"
        )

    def validate(
        self,
        check_obj: Union[pd.DataFrame, pd.Series],
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, pd.Series]:
        # pylint: disable=too-many-locals,too-many-branches,too-many-statements
        """Validate a series or specific column in dataframe.

        :param check_obj: pandas DataFrame or Series to validate.
        :param head: validate the first n rows. Rows overlapping with `tail` or
            `sample` are de-duplicated.
        :param tail: validate the last n rows. Rows overlapping with `head` or
            `sample` are de-duplicated.
        :param sample: validate a random sample of n rows. Rows overlapping
            with `head` or `tail` are de-duplicated.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all validation
            checks and raises a ``SchemaErrors``. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :param inplace: if True, applies coercion to the object of validation,
            otherwise creates a copy of the data.
        :returns: validated DataFrame or Series.
        """
        if self._is_inferred:
            warnings.warn(
                f"This {type(self)} is an inferred schema that hasn't been "
                "modified. It's recommended that you refine the schema "
                "by calling `set_checks` before using it to validate data.",
                UserWarning,
            )

        error_handler = SchemaErrorHandler(lazy)

        if not inplace:
            check_obj = check_obj.copy()

        series = (
            check_obj
            if check_utils.is_field(check_obj)
            else check_obj[self.name]
        )

        series = _pandas_obj_to_validate(
            series, head, tail, sample, random_state
        )

        check_obj = _pandas_obj_to_validate(
            check_obj, head, tail, sample, random_state
        )

        if self.name is not None and series.name != self._name:
            msg = (
                f"Expected {type(self)} to have name '{self._name}', found "
                f"'{series.name}'"
            )
            error_handler.collect_error(
                "wrong_field_name",
                errors.SchemaError(
                    self,
                    check_obj,
                    msg,
                    failure_cases=scalar_failure_case(series.name),
                    check=f"field_name('{self._name}')",
                ),
            )

        if not self._nullable:
            nulls = series.isna()
            if nulls.sum() > 0:
                failed = series[nulls]
                msg = (
                    f"non-nullable series '{series.name}' contains null "
                    f"values:\n{failed}"
                )
                error_handler.collect_error(
                    "series_contains_nulls",
                    errors.SchemaError(
                        self,
                        check_obj,
                        msg,
                        failure_cases=reshape_failure_cases(
                            series[nulls], ignore_na=False
                        ),
                        check="not_nullable",
                    ),
                )

        # Check if the series contains duplicate values
        if self._unique:
            if type(series).__module__.startswith("databricks.koalas"):
                duplicates = (
                    series.to_frame().duplicated().reindex(series.index)
                )
                # pylint: disable=import-outside-toplevel
                import databricks.koalas as ks

                with ks.option_context("compute.ops_on_diff_frames", True):
                    failed = series[duplicates]
            else:
                duplicates = series.duplicated()
                failed = series[duplicates]

            if duplicates.any():
                msg = (
                    f"series '{series.name}' contains duplicate values:\n"
                    f"{failed}"
                )
                error_handler.collect_error(
                    "series_contains_duplicates",
                    errors.SchemaError(
                        self,
                        check_obj,
                        msg,
                        failure_cases=reshape_failure_cases(failed),
                        check="field_uniqueness",
                    ),
                )

        if self._dtype is not None and (
            not self._dtype.check(pandas_engine.Engine.dtype(series.dtype))
        ):
            msg = (
                f"expected series '{series.name}' to have type {self._dtype}, "
                + f"got {series.dtype}"
            )
            error_handler.collect_error(
                "wrong_dtype",
                errors.SchemaError(
                    self,
                    check_obj,
                    msg,
                    failure_cases=scalar_failure_case(str(series.dtype)),
                    check=f"dtype('{self.dtype}')",
                ),
            )

        check_results = []
        if check_utils.is_field(check_obj):
            check_obj, check_args = series, [None]
        else:
            check_args = [self.name]  # type: ignore

        for check_index, check in enumerate(self.checks):
            try:
                check_results.append(
                    _handle_check_results(
                        self, check_index, check, check_obj, *check_args
                    )
                )
            except errors.SchemaError as err:
                error_handler.collect_error("dataframe_check", err)
            except Exception as err:  # pylint: disable=broad-except
                # catch other exceptions that may occur when executing the
                # Check
                err_msg = f'"{err.args[0]}"' if len(err.args) > 0 else ""
                err_str = f"{err.__class__.__name__}({err_msg})"
                msg = (
                    f"Error while executing check function: {err_str}\n"
                    + traceback.format_exc()
                )
                error_handler.collect_error(
                    "check_error",
                    errors.SchemaError(
                        self,
                        check_obj,
                        msg,
                        failure_cases=scalar_failure_case(err_str),
                        check=check,
                        check_index=check_index,
                    ),
                    original_exc=err,
                )

        if lazy and error_handler.collected_errors:
            raise errors.SchemaErrors(
                error_handler.collected_errors, check_obj
            )

        assert all(check_results)
        return check_obj

    def __call__(
        self,
        check_obj: Union[pd.DataFrame, pd.Series],
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, pd.Series]:
        """Alias for ``validate`` method."""
        return self.validate(
            check_obj, head, tail, sample, random_state, lazy, inplace
        )

    def __eq__(self, other):
        return self.__dict__ == other.__dict__

    @st.strategy_import_error
    def strategy(self, *, size=None):
        """Create a ``hypothesis`` strategy for generating a Series.

        :param size: number of elements to generate
        :returns: a strategy that generates pandas Series objects.
        """
        return st.series_strategy(
            self.dtype,
            checks=self.checks,
            nullable=self.nullable,
            unique=self.unique,
            name=self.name,
            size=size,
        )

    def example(self, size=None) -> pd.Series:
        """Generate an example of a particular size.

        :param size: number of elements in the generated Series.
        :returns: pandas Series object.
        """
        # pylint: disable=import-outside-toplevel,cyclic-import,import-error
        import hypothesis

        with warnings.catch_warnings():
            warnings.simplefilter(
                "ignore",
                category=hypothesis.errors.NonInteractiveExampleWarning,
            )
            return self.strategy(size=size).example()

    def __repr__(self):
        return (
            f"<Schema {self.__class__.__name__}"
            f"(name={self._name}, type={self.dtype!r})>"
        )

    @classmethod
    def __get_validators__(cls):
        yield cls._pydantic_validate

    @classmethod
    def _pydantic_validate(  # type: ignore
        cls: TSeriesSchemaBase, schema: Any
    ) -> TSeriesSchemaBase:
        """Verify that the input is a compatible DataFrameSchema."""
        if not isinstance(schema, cls):  # type: ignore
            raise TypeError(f"{schema} is not a {cls}.")
        return cast(TSeriesSchemaBase, schema)


class SeriesSchema(SeriesSchemaBase):
    """Series validator."""

    @deprecate_pandas_dtype
    def __init__(
        self,
        dtype: PandasDtypeInputTypes = None,
        checks: CheckList = None,
        index=None,
        nullable: bool = False,
        unique: bool = False,
        allow_duplicates: Optional[bool] = None,
        coerce: bool = False,
        name: str = None,
        pandas_dtype: PandasDtypeInputTypes = None,
        title: Optional[str] = None,
        description: Optional[str] = None,
    ) -> None:
        """Initialize series schema base object.

        :param dtype: datatype of the column. If a string is specified,
            then assumes one of the valid pandas string values:
            http://pandas.pydata.org/pandas-docs/stable/basics.html#dtypes
        :param checks: If element_wise is True, then callable signature should
            be: ``Callable[Any, bool]`` where the ``Any`` input is a scalar
            element in the column. Otherwise, the input is assumed to be a
            pandas.Series object.
        :param index: specify the datatypes and properties of the index.
        :param nullable: Whether or not column can contain null values.
        :param unique: Whether or not column can contain duplicate values.
        :param allow_duplicates: Whether or not column can contain duplicate
            values.

            .. warning::
                This option will be deprecated in 0.8.0. Use the ``unique``
                argument instead.

        :param coerce: If True, when schema.validate is called the column will
            be coerced into the specified dtype. This has no effect on columns
            where ``pandas_dtype=None``.
        :param name: series name.
        :param pandas_dtype: alias of ``dtype`` for backwards compatibility.

            .. warning:: This option will be deprecated in 0.8.0

        :param title: A human-readable label for the series.
        :param description: An arbitrary textual description of the series.
        """
        super().__init__(
            dtype,
            checks,
            nullable,
            unique,
            allow_duplicates,
            coerce,
            name,
            pandas_dtype,
            title,
            description,
        )
        self.index = index

    @property
    def _allow_groupby(self) -> bool:
        """Whether the schema or schema component allows groupby operations."""
        return False

    def validate(
        self,
        check_obj: pd.Series,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> pd.Series:
        """Validate a Series object.

        :param check_obj: One-dimensional ndarray with axis labels...
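For orientation, here is a minimal usage sketch of the ``validate``/``__call__`` API shown above. It assumes pandera ~0.7 (the version this source appears to come from); the series name "price" and the sample data are invented for illustration.

import pandas as pd
import pandera as pa

# SeriesSchema feeds dtype, nullable, and unique into the
# SeriesSchemaBase.validate() logic shown in the snippet.
schema = pa.SeriesSchema(
    float,
    checks=pa.Check.greater_than(0),
    nullable=False,
    unique=True,
    name="price",
)

# Happy path: validate() returns the validated Series.
print(schema.validate(pd.Series([1.5, 2.0, 3.25], name="price")))

try:
    # lazy=True collects every failure via the SchemaErrorHandler and
    # raises a single SchemaErrors; schema(...) is an alias for validate().
    schema(pd.Series([1.0, -1.0, 1.0], name="price"), lazy=True)
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)  # one row per collected failure case

With lazy=False (the default), the first failed check raises a SchemaError immediately, mirroring the error_handler branches in the snippet.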
schema_components.py
Source: schema_components.py
...
    def regex(self) -> bool:
        """True if ``name`` attribute should be treated as a regex pattern."""
        return self._regex

    @property
    def _allow_groupby(self) -> bool:
        """Whether the schema or schema component allows groupby operations."""
        return True

    @property
    def properties(self) -> Dict[str, Any]:
        """Get column properties."""
        return {
            "dtype": self.dtype,
            "checks": self._checks,
            "nullable": self._nullable,
            "unique": self._unique,
            "coerce": self._coerce,
            "required": self.required,
            "name": self._name,
            "regex": self._regex,
            "title": self.title,
            "description": self.description,
        }

    def set_name(self, name: str):
        """Used to set or modify the name of a column object.

        :param str name: the name of the column object
        """
        self._name = name
        return self

    def coerce_dtype(self, obj: Union[pd.DataFrame, pd.Series, pd.Index]):
        """Coerce dtype of a column, handling duplicate column names."""
        # pylint: disable=super-with-arguments
        if check_utils.is_field(obj) or check_utils.is_index(obj):
            return super(Column, self).coerce_dtype(obj)
        return obj.apply(
            lambda x: super(Column, self).coerce_dtype(x), axis="columns"
        )

    def validate(
        self,
        check_obj: pd.DataFrame,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> pd.DataFrame:
        """Validate a Column in a DataFrame object.

        :param check_obj: pandas DataFrame to validate.
        :param head: validate the first n rows. Rows overlapping with `tail` or
            `sample` are de-duplicated.
        :param tail: validate the last n rows. Rows overlapping with `head` or
            `sample` are de-duplicated.
        :param sample: validate a random sample of n rows. Rows overlapping
            with `head` or `tail` are de-duplicated.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all validation
            checks and raises a ``SchemaErrors``. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :param inplace: if True, applies coercion to the object of validation,
            otherwise creates a copy of the data.
        :returns: validated DataFrame.
        """
        if not inplace:
            check_obj = check_obj.copy()

        if self._name is None:
            raise errors.SchemaError(
                self,
                check_obj,
                "column name is set to None. Pass the ``name`` argument when "
                "initializing a Column object, or use the ``set_name`` "
                "method.",
            )

        def validate_column(check_obj, column_name):
            super(Column, copy(self).set_name(column_name)).validate(
                check_obj,
                head,
                tail,
                sample,
                random_state,
                lazy,
                inplace=inplace,
            )

        column_keys_to_check = (
            self.get_regex_columns(check_obj.columns)
            if self._regex
            else [self._name]
        )

        for column_name in column_keys_to_check:
            if self.coerce:
                check_obj[column_name] = self.coerce_dtype(
                    check_obj[column_name]
                )
            if check_utils.is_table(check_obj[column_name]):
                for i in range(check_obj[column_name].shape[1]):
                    validate_column(
                        check_obj[column_name].iloc[:, [i]], column_name
                    )
            else:
                validate_column(check_obj, column_name)

        return check_obj

    def get_regex_columns(
        self, columns: Union[pd.Index, pd.MultiIndex]
    ) -> Union[pd.Index, pd.MultiIndex]:
        """Get matching column names based on regex column name pattern.

        :param columns: columns to regex pattern match
        :returns: matching columns
        """
        if isinstance(self.name, tuple):
            # handle MultiIndex case
            if len(self.name) != columns.nlevels:
                raise IndexError(
                    f"Column regex name='{self.name}' is a tuple, expected a "
                    f"MultiIndex columns with {len(self.name)} number of "
                    f"levels, found {columns.nlevels} level(s)"
                )
            matches = np.ones(len(columns)).astype(bool)
            for i, name in enumerate(self.name):
                matched = pd.Index(
                    columns.get_level_values(i).astype(str).str.match(name)
                ).fillna(False)
                matches = matches & np.array(matched.tolist())
            column_keys_to_check = columns[matches]
        else:
            if check_utils.is_multiindex(columns):
                raise IndexError(
                    f"Column regex name {self.name} is a string, expected a "
                    "dataframe where the index is a pd.Index object, not a "
                    "pd.MultiIndex object"
                )
            column_keys_to_check = columns[
                # str.match will return nan values when the index value is
                # not a string.
                pd.Index(columns.astype(str).str.match(self.name))
                .fillna(False)
                .tolist()
            ]
        if column_keys_to_check.shape[0] == 0:
            raise errors.SchemaError(
                self,
                columns,
                f"Column regex name='{self.name}' did not match any columns "
                "in the dataframe. Update the regex pattern so that it "
                f"matches at least one column:\n{columns.tolist()}",
            )
        # drop duplicates to account for potential duplicated columns in the
        # dataframe.
        return column_keys_to_check.drop_duplicates()

    @st.strategy_import_error
    def strategy(self, *, size=None):
        """Create a ``hypothesis`` strategy for generating a Column.

        :param size: number of elements to generate
        :returns: a dataframe strategy for a single column.
        """
        return super().strategy(size=size).map(lambda x: x.to_frame())

    @st.strategy_import_error
    def strategy_component(self):
        """Generate column data object for use by DataFrame strategy."""
        return st.column_strategy(
            self.dtype,
            checks=self.checks,
            unique=self.unique,
            name=self.name,
        )

    def example(self, size=None) -> pd.DataFrame:
        """Generate an example of a particular size.

        :param size: number of elements in the generated column.
        :returns: pandas DataFrame object.
        """
        # pylint: disable=import-outside-toplevel,cyclic-import,import-error
        import hypothesis

        with warnings.catch_warnings():
            warnings.simplefilter(
                "ignore",
                category=hypothesis.errors.NonInteractiveExampleWarning,
            )
            return (
                super()
                .strategy(size=size)
                .example()
                .rename(self.name)
                .to_frame()
            )

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return NotImplemented

        def _compare_dict(obj):
            return {
                k: v if k != "_checks" else set(v)
                for k, v in obj.__dict__.items()
            }

        return _compare_dict(self) == _compare_dict(other)


class Index(SeriesSchemaBase):
    """Validate types and properties of a DataFrame Index."""

    @property
    def names(self):
        """Get index names in the Index schema component."""
        return [self.name]

    @property
    def _allow_groupby(self) -> bool:
        """Whether the schema or schema component allows groupby operations."""
        return False

    def validate(
        self,
        check_obj: Union[pd.DataFrame, pd.Series],
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, pd.Series]:
        """Validate DataFrameSchema or SeriesSchema Index.

        :param check_obj: pandas DataFrame or Series containing index to validate...
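To make the ``Column`` regex machinery concrete, here is a minimal usage sketch, again assuming pandera ~0.7; the column names and data are invented for illustration.

import pandas as pd
import pandera as pa

schema = pa.DataFrameSchema(
    {
        # regex=True makes get_regex_columns() match every column whose
        # name fits the pattern; each match is then validated individually,
        # as in Column.validate() above.
        "sensor_.+": pa.Column(float, checks=pa.Check.ge(0), regex=True),
        "label": pa.Column(str),
    }
)

df = pd.DataFrame(
    {
        "sensor_a": [0.1, 0.5],
        "sensor_b": [1.2, 3.4],
        "label": ["x", "y"],
    }
)

# Both sensor_a and sensor_b are validated against the regex column;
# per the snippet, a SchemaError is raised if no column matches the pattern.
print(schema.validate(df))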