Join the 1-day Testing & QA Summit featuring 15+ Expert Speakers. Register for FREE! Join TestMu Conference

How to use the parse_check_statistics method in pandera

Best Python code snippet using pandera_python

test_schema_statistics.py

Source: test_schema_statistics.py

...129        ],130        [{}, None],131    ],132)133def test_parse_check_statistics(check_stats, expectation) -> None:134    """Test that Checks are correctly parsed from check statistics."""135    if expectation is None:136        expectation = []137    checks = schema_statistics.parse_check_statistics(check_stats)138    if checks is None:139        checks = []140    assert set(checks) == set(expectation)141def _test_statistics(statistics, expectations):142    if not isinstance(statistics, list):143        statistics = [statistics]144    if not isinstance(expectations, list):145        expectations = [expectations]146    for stats, expectation in zip(statistics, expectations):147        stat_dtype = stats.pop("dtype")148        expectation_dtype = expectation.pop("dtype")149        assert stats == expectation150        assert expectation_dtype.check(stat_dtype)151@pytest.mark.parametrize(152    "series, expectation",153    [154        *[155            [156                pd.Series(157                    [1, 2, 3], dtype=str(pandas_engine.Engine.dtype(data_type))158                ),159                {160                    "dtype": pandas_engine.Engine.dtype(data_type),161                    "nullable": False,162                    "checks": {163                        "greater_than_or_equal_to": 1,164                        "less_than_or_equal_to": 3,165                    },166                    "name": None,167                },168            ]169            for data_type in NUMERIC_TYPES170        ],171        [172            pd.Series(["a", "b", "c", "a"], dtype="category"),173            {174                "dtype": pandas_engine.Engine.dtype(pa.Category),175                "nullable": False,176                "checks": {"isin": ["a", "b", "c"]},177                "name": None,178            },179        ],180        [181            pd.Series(["a", "b", "c", "a"], dtype="string", name="str_series"),182            {183                "dtype": 
pandas_engine.Engine.dtype("string"),184                "nullable": False,185                "checks": None,186                "name": "str_series",187            },188        ],189        [190            pd.Series(pd.to_datetime(["20180101", "20180102", "20180103"])),191            {192                "dtype": pandas_engine.Engine.dtype(pa.DateTime),193                "nullable": False,194                "checks": {195                    "greater_than_or_equal_to": pd.Timestamp("20180101"),196                    "less_than_or_equal_to": pd.Timestamp("20180103"),197                },198                "name": None,199            },200        ],201    ],202)203def test_infer_series_schema_statistics(series, expectation) -> None:204    """Test series statistics are correctly inferred."""205    statistics = schema_statistics.infer_series_statistics(series)206    _test_statistics(statistics, expectation)207@pytest.mark.parametrize(208    "null_index, series, expectation",209    [210        *[211            [212                0,213                pd.Series([1, 2, 3], dtype=str(data_type)),214                {215                    # introducing nans to integer arrays upcasts to float216                    "dtype": DEFAULT_FLOAT,217                    "nullable": True,218                    "checks": {219                        "greater_than_or_equal_to": 2,220                        "less_than_or_equal_to": 3,221                    },222                    "name": None,223                },224            ]225            for data_type in INTEGER_TYPES226        ],227        [228            # introducing nans to bool arrays upcasts to float except229            # for pandas >= 1.3.0230            0,231            pd.Series([True, False, True, False]),232            {233                "dtype": (234                    pandas_engine.Engine.dtype(pa.BOOL)235                    if pa.PANDAS_1_3_0_PLUS236                    else DEFAULT_FLOAT237                ),238           
     "nullable": True,239                "checks": (240                    None241                    if pa.PANDAS_1_3_0_PLUS242                    else {243                        "greater_than_or_equal_to": 0,244                        "less_than_or_equal_to": 1,245                    }246                ),247                "name": None,248            },249        ],250        [251            0,252            pd.Series(["a", "b", "c", "a"], dtype="category"),253            {254                "dtype": pandas_engine.Engine.dtype(pa.Category),255                "nullable": True,256                "checks": {"isin": ["a", "b", "c"]},257                "name": None,258            },259        ],260        [261            0,262            pd.Series(["a", "b", "c", "a"], name="str_series"),263            {264                "dtype": pandas_engine.Engine.dtype(str),265                "nullable": True,266                "checks": None,267                "name": "str_series",268            },269        ],270        [271            2,272            pd.Series(pd.to_datetime(["20180101", "20180102", "20180103"])),273            {274                "dtype": pandas_engine.Engine.dtype(pa.DateTime),275                "nullable": True,276                "checks": {277                    "greater_than_or_equal_to": pd.Timestamp("20180101"),278                    "less_than_or_equal_to": pd.Timestamp("20180102"),279                },280                "name": None,281            },282        ],283    ],284)285def test_infer_nullable_series_schema_statistics(286    null_index, series, expectation287):288    """Test nullable series statistics are correctly inferred."""289    series.iloc[null_index] = None290    statistics = schema_statistics.infer_series_statistics(series)291    _test_statistics(statistics, expectation)292@pytest.mark.parametrize(293    "index, expectation",294    [295        [296            pd.RangeIndex(20),297            [298                {299                  
  "name": None,300                    "dtype": DEFAULT_INT,301                    "nullable": False,302                    "checks": {303                        "greater_than_or_equal_to": 0,304                        "less_than_or_equal_to": 19,305                    },306                }307            ],308        ],309        [310            pd.Index([1, 2, 3], name="int_index"),311            [312                {313                    "name": "int_index",314                    "dtype": DEFAULT_INT,315                    "nullable": False,316                    "checks": {317                        "greater_than_or_equal_to": 1,318                        "less_than_or_equal_to": 3,319                    },320                }321            ],322        ],323        [324            pd.Index(["foo", "bar", "baz"], name="str_index"),325            [326                {327                    "name": "str_index",328                    "dtype": pandas_engine.Engine.dtype("object"),329                    "nullable": False,330                    "checks": None,331                },332            ],333        ],334        [335            pd.MultiIndex.from_arrays(336                [[10, 11, 12], pd.Series(["a", "b", "c"], dtype="category")],337                names=["int_index", "str_index"],338            ),339            [340                {341                    "name": "int_index",342                    "dtype": DEFAULT_INT,343                    "nullable": False,344                    "checks": {345                        "greater_than_or_equal_to": 10,346                        "less_than_or_equal_to": 12,347                    },348                },349                {350                    "name": "str_index",351                    "dtype": pandas_engine.Engine.dtype(pa.Category),352                    "nullable": False,353                    "checks": {"isin": ["a", "b", "c"]},354                },355            ],356        ],357        # UserWarning 
cases358        [1, UserWarning],359        ["foo", UserWarning],360        [{"foo": "bar"}, UserWarning],361        [["foo", "bar"], UserWarning],362        [pd.Series(["foo", "bar"]), UserWarning],363        [pd.DataFrame({"column": ["foo", "bar"]}), UserWarning],364    ],365)366def test_infer_index_statistics(index, expectation):367    """Test that index statistics are correctly inferred."""368    if expectation is UserWarning:369        with pytest.warns(UserWarning, match="^index type .+ not recognized"):370            schema_statistics.infer_index_statistics(index)371    else:372        _test_statistics(373            schema_statistics.infer_index_statistics(index), expectation374        )375def test_get_dataframe_schema_statistics():376    """Test that dataframe schema statistics logic is correct."""377    schema = pa.DataFrameSchema(378        columns={379            "int": pa.Column(380                int,381                checks=[382                    pa.Check.greater_than_or_equal_to(0),383                    pa.Check.less_than_or_equal_to(100),384                ],385                nullable=True,386            ),387            "float": pa.Column(388                float,389                checks=[390                    pa.Check.greater_than_or_equal_to(50),391                    pa.Check.less_than_or_equal_to(100),392                ],393            ),394            "str": pa.Column(395                str,396                checks=[pa.Check.isin(["foo", "bar", "baz"])],397            ),398        },399        index=pa.Index(400            int,401            checks=pa.Check.greater_than_or_equal_to(0),402            nullable=False,403            name="int_index",404        ),405    )406    expectation = {407        "checks": None,408        "columns": {409            "int": {410                "dtype": DEFAULT_INT,411                "checks": {412                    "greater_than_or_equal_to": {"min_value": 0},413                    
"less_than_or_equal_to": {"max_value": 100},414                },415                "nullable": True,416                "unique": False,417                "coerce": False,418                "required": True,419                "regex": False,420            },421            "float": {422                "dtype": DEFAULT_FLOAT,423                "checks": {424                    "greater_than_or_equal_to": {"min_value": 50},425                    "less_than_or_equal_to": {"max_value": 100},426                },427                "nullable": False,428                "unique": False,429                "coerce": False,430                "required": True,431                "regex": False,432            },433            "str": {434                "dtype": pandas_engine.Engine.dtype(str),435                "checks": {"isin": {"allowed_values": ["foo", "bar", "baz"]}},436                "nullable": False,437                "unique": False,438                "coerce": False,439                "required": True,440                "regex": False,441            },442        },443        "index": [444            {445                "dtype": DEFAULT_INT,446                "checks": {"greater_than_or_equal_to": {"min_value": 0}},447                "nullable": False,448                "coerce": False,449                "name": "int_index",450            }451        ],452        "coerce": False,453    }454    statistics = schema_statistics.get_dataframe_schema_statistics(schema)455    assert statistics == expectation456def test_get_series_schema_statistics():457    """Test that series schema statistics logic is correct."""458    schema = pa.SeriesSchema(459        int,460        nullable=False,461        checks=[462            pa.Check.greater_than_or_equal_to(0),463            pa.Check.less_than_or_equal_to(100),464        ],465    )466    statistics = schema_statistics.get_series_schema_statistics(schema)467    assert statistics == {468        "dtype": 
pandas_engine.Engine.dtype(int),469        "nullable": False,470        "checks": {471            "greater_than_or_equal_to": {"min_value": 0},472            "less_than_or_equal_to": {"max_value": 100},473        },474        "name": None,475        "coerce": False,476    }477@pytest.mark.parametrize(478    "index_schema_component, expectation",479    [480        [481            pa.Index(482                int,483                checks=[484                    pa.Check.greater_than_or_equal_to(10),485                    pa.Check.less_than_or_equal_to(20),486                ],487                nullable=False,488                name="int_index",489            ),490            [491                {492                    "dtype": pandas_engine.Engine.dtype(int),493                    "nullable": False,494                    "checks": {495                        "greater_than_or_equal_to": {"min_value": 10},496                        "less_than_or_equal_to": {"max_value": 20},497                    },498                    "name": "int_index",499                    "coerce": False,500                }501            ],502        ]503    ],504)505def test_get_index_schema_statistics(index_schema_component, expectation):506    """Test that index schema statistics logic is correct."""507    statistics = schema_statistics.get_index_schema_statistics(508        index_schema_component509    )510    _test_statistics(statistics, expectation)511@pytest.mark.parametrize(512    "checks, expectation",513    [514        *[515            [[check], {check.name: check.statistics}]516            for check in [517                pa.Check.greater_than(1),518                pa.Check.less_than(1),519                pa.Check.in_range(1, 3),520                pa.Check.equal_to(1),521                pa.Check.not_equal_to(1),522                pa.Check.notin([1, 2, 3]),523                pa.Check.str_matches("foobar"),524                pa.Check.str_contains("foobar"),525                
pa.Check.str_startswith("foobar"),526                pa.Check.str_endswith("foobar"),527                pa.Check.str_length(5, 10),528            ]529        ],530        # multiple checks at once531        [532            [533                pa.Check.greater_than_or_equal_to(10),534                pa.Check.less_than_or_equal_to(50),535                pa.Check.isin([10, 20, 30, 40, 50]),536            ],537            {538                "greater_than_or_equal_to": {"min_value": 10},539                "less_than_or_equal_to": {"max_value": 50},540                "isin": {"allowed_values": [10, 20, 30, 40, 50]},541            },542        ],543        # incompatible checks544        *[545            [546                [547                    pa.Check.greater_than_or_equal_to(min_value),548                    pa.Check.less_than_or_equal_to(max_value),549                ],550                ValueError,551            ]552            for min_value, max_value in [553                (5, 1),554                (10, 1),555                (100, 10),556                (1000, 100),557            ]558        ],559    ],560)561def test_parse_checks_and_statistics_roundtrip(checks, expectation):562    """563    Test that parse checks correctly obtain statistics from checks and564    vice-versa.565    """566    if expectation is ValueError:567        with pytest.raises(ValueError):568            schema_statistics.parse_checks(checks)569        return570    assert schema_statistics.parse_checks(checks) == expectation571    check_statistics = {check.name: check.statistics for check in checks}572    check_list = schema_statistics.parse_check_statistics(check_statistics)573    assert set(check_list) == set(checks)574# pylint: disable=unused-argument575def test_parse_checks_and_statistics_no_param(extra_registered_checks):576    """577    Ensure that an edge case where a check does not have parameters is578    appropriately handled.579    """580    checks = 
[pa.Check.no_param_check()]581    expectation = {"no_param_check": {}}582    assert schema_statistics.parse_checks(checks) == expectation583    check_statistics = {check.name: check.statistics for check in checks}584    check_list = schema_statistics.parse_check_statistics(check_statistics)585    assert set(check_list) == set(checks)...

schema_inference.py

Source: schema_inference.py

...28def _create_index(index_statistics):29    index = [30        Index(31            properties["dtype"],32            checks=parse_check_statistics(properties["checks"]),33            nullable=properties["nullable"],34            name=properties["name"],35        )36        for properties in index_statistics37    ]38    if len(index) == 1:39        index = index[0]  # type: ignore40    else:41        index = MultiIndex(index)  # type: ignore42    return index43def infer_dataframe_schema(df: pd.DataFrame) -> DataFrameSchema:44    """Infer a DataFrameSchema from a pandas DataFrame.45    :param df: DataFrame object to infer.46    :returns: DataFrameSchema47    """48    df_statistics = infer_dataframe_statistics(df)49    schema = DataFrameSchema(50        columns={51            colname: Column(52                properties["dtype"],53                checks=parse_check_statistics(properties["checks"]),54                nullable=properties["nullable"],55            )56            for colname, properties in df_statistics["columns"].items()57        },58        index=_create_index(df_statistics["index"]),59        coerce=True,60    )61    schema._is_inferred = True62    return schema63def infer_series_schema(series) -> SeriesSchema:64    """Infer a SeriesSchema from a pandas DataFrame.65    :param series: Series object to infer.66    :returns: SeriesSchema67    """68    series_statistics = infer_series_statistics(series)69    schema = SeriesSchema(70        dtype=series_statistics["dtype"],71        checks=parse_check_statistics(series_statistics["checks"]),72        nullable=series_statistics["nullable"],73        name=series_statistics["name"],74        coerce=True,75    )76    schema._is_inferred = True...

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.