Best Python code snippet using keyboard
test_shuffle.py
Source:test_shuffle.py
...114 df = pd.DataFrame({'x': np.random.random(100),115 'y': np.random.random(100) // 0.2},116 index=np.random.random(100))117 ddf = dd.from_pandas(df, npartitions=npartitions)118 assert_eq(df.set_index('x'),119 ddf.set_index('x', shuffle='tasks'))120 assert_eq(df.set_index('y'),121 ddf.set_index('y', shuffle='tasks'))122 assert_eq(df.set_index(df.x),123 ddf.set_index(ddf.x, shuffle='tasks'))124 assert_eq(df.set_index(df.x + df.y),125 ddf.set_index(ddf.x + ddf.y, shuffle='tasks'))126 assert_eq(df.set_index(df.x + 1),127 ddf.set_index(ddf.x + 1, shuffle='tasks'))128 assert_eq(df.set_index(df.index),129 ddf.set_index(ddf.index, shuffle='tasks'))130@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])131def test_set_index_self_index(shuffle):132 df = pd.DataFrame({'x': np.random.random(100),133 'y': np.random.random(100) // 0.2},134 index=np.random.random(100))135 a = dd.from_pandas(df, npartitions=4)136 b = a.set_index(a.index, shuffle=shuffle)137 assert a is b138 assert_eq(b, df.set_index(df.index))139@pytest.mark.parametrize('shuffle', ['tasks'])140def test_set_index_names(shuffle):141 df = pd.DataFrame({'x': np.random.random(100),142 'y': np.random.random(100) // 0.2},143 index=np.random.random(100))144 ddf = dd.from_pandas(df, npartitions=4)145 assert (set(ddf.set_index('x', shuffle=shuffle).dask) ==146 set(ddf.set_index('x', shuffle=shuffle).dask))147 assert (set(ddf.set_index('x', shuffle=shuffle).dask) !=148 set(ddf.set_index('y', shuffle=shuffle).dask))149 assert (set(ddf.set_index('x', max_branch=4, shuffle=shuffle).dask) !=150 set(ddf.set_index('x', max_branch=3, shuffle=shuffle).dask))151 assert (set(ddf.set_index('x', drop=True, shuffle=shuffle).dask) !=152 set(ddf.set_index('x', drop=False, shuffle=shuffle).dask))153@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])154def test_set_index_tasks_2(shuffle):155 df = dd.demo.make_timeseries(156 '2000', '2004', {'value': float, 'name': str, 'id': int},157 freq='2H', partition_freq='1M', seed=1)158 df2 = 
df.set_index('name', shuffle=shuffle)159 df2.value.sum().compute(get=dask.get)160@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])161def test_set_index_tasks_3(shuffle):162 df = pd.DataFrame(np.random.random((10, 2)), columns=['x', 'y'])163 ddf = dd.from_pandas(df, npartitions=5)164 ddf2 = ddf.set_index('x', shuffle=shuffle, max_branch=2,165 npartitions=ddf.npartitions)166 df2 = df.set_index('x')167 assert_eq(df2, ddf2)168 assert ddf2.npartitions == ddf.npartitions169@pytest.mark.parametrize('shuffle', ['tasks', 'disk'])170def test_shuffle_sort(shuffle):171 df = pd.DataFrame({'x': [1, 2, 3, 2, 1], 'y': [9, 8, 7, 1, 5]})172 ddf = dd.from_pandas(df, npartitions=3)173 df2 = df.set_index('x').sort_index()174 ddf2 = ddf.set_index('x', shuffle=shuffle)175 assert_eq(ddf2.loc[2:3], df2.loc[2:3])176@pytest.mark.parametrize('shuffle', ['tasks', 'disk'])177@pytest.mark.parametrize('get', [threaded_get, mp_get])178def test_rearrange(shuffle, get):179 df = pd.DataFrame({'x': np.random.random(10)})180 ddf = dd.from_pandas(df, npartitions=4)181 ddf2 = ddf.assign(y=ddf.x % 4)182 result = rearrange_by_column(ddf2, 'y', max_branch=32, shuffle=shuffle)183 assert result.npartitions == ddf.npartitions184 assert set(ddf.dask).issubset(result.dask)185 # Every value in exactly one partition186 a = result.compute(get=get)187 parts = get(result.dask, result._keys())188 for i in a.y.drop_duplicates():189 assert sum(i in part.y for part in parts) == 1190def test_rearrange_by_column_with_narrow_divisions():191 from dask.dataframe.tests.test_multi import list_eq192 A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})193 a = dd.repartition(A, [0, 4, 5])194 df = rearrange_by_divisions(a, 'x', (0, 2, 5))195 list_eq(df, a)196def test_maybe_buffered_partd():197 import partd198 f = maybe_buffered_partd()199 p1 = f()200 assert isinstance(p1.partd, partd.Buffer)201 f2 = pickle.loads(pickle.dumps(f))202 assert not f2.buffer203 p2 = f2()204 assert isinstance(p2.partd, 
partd.File)205def test_set_index_with_explicit_divisions():206 df = pd.DataFrame({'x': [4, 1, 2, 5]}, index=[10, 20, 30, 40])207 ddf = dd.from_pandas(df, npartitions=2)208 def throw(*args, **kwargs):209 raise Exception()210 with dask.set_options(get=throw):211 ddf2 = ddf.set_index('x', divisions=[1, 3, 5])212 assert ddf2.divisions == (1, 3, 5)213 df2 = df.set_index('x')214 assert_eq(ddf2, df2)215 # Divisions must be sorted216 with pytest.raises(ValueError):217 ddf.set_index('x', divisions=[3, 1, 5])218def test_set_index_divisions_2():219 df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')})220 ddf = dd.from_pandas(df, 2)221 result = ddf.set_index('y', divisions=['a', 'c', 'd'])222 assert result.divisions == ('a', 'c', 'd')223 assert list(result.compute(get=dask.get).index[-2:]) == ['d', 'd']224def test_set_index_divisions_compute():225 d2 = d.set_index('b', divisions=[0, 2, 9], compute=False)226 d3 = d.set_index('b', divisions=[0, 2, 9], compute=True)227 assert_eq(d2, d3)228 assert_eq(d2, full.set_index('b'))229 assert_eq(d3, full.set_index('b'))230 assert len(d2.dask) > len(d3.dask)231 d4 = d.set_index(d.b, divisions=[0, 2, 9], compute=False)232 d5 = d.set_index(d.b, divisions=[0, 2, 9], compute=True)233 exp = full.copy()234 exp.index = exp.b235 assert_eq(d4, d5)236 assert_eq(d4, exp)237 assert_eq(d5, exp)238 assert len(d4.dask) > len(d5.dask)239def test_set_index_divisions_sorted():240 p1 = pd.DataFrame({'x': [10, 11, 12], 'y': ['a', 'a', 'a']})241 p2 = pd.DataFrame({'x': [13, 14, 15], 'y': ['b', 'b', 'c']})242 p3 = pd.DataFrame({'x': [16, 17, 18], 'y': ['d', 'e', 'e']})243 ddf = dd.DataFrame({('x', 0): p1, ('x', 1): p2, ('x', 2): p3},244 'x', p1, [None, None, None, None])245 df = ddf.compute()246 def throw(*args, **kwargs):247 raise Exception("Shouldn't have computed")248 with dask.set_options(get=throw):249 res = ddf.set_index('x', divisions=[10, 13, 16, 18], sorted=True)250 assert_eq(res, df.set_index('x'))251 with dask.set_options(get=throw):252 
res = ddf.set_index('y', divisions=['a', 'b', 'd', 'e'], sorted=True)253 assert_eq(res, df.set_index('y'))254 # with sorted=True, divisions must be same length as df.divisions255 with pytest.raises(ValueError):256 ddf.set_index('y', divisions=['a', 'b', 'c', 'd', 'e'], sorted=True)257 # Divisions must be sorted258 with pytest.raises(ValueError):259 ddf.set_index('y', divisions=['a', 'b', 'd', 'c'], sorted=True)260@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])261def test_set_index_reduces_partitions_small(shuffle):262 df = pd.DataFrame({'x': np.random.random(100)})263 ddf = dd.from_pandas(df, npartitions=50)264 ddf2 = ddf.set_index('x', shuffle=shuffle, npartitions='auto')265 assert ddf2.npartitions < 10266@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])267def test_set_index_reduces_partitions_large(shuffle):268 n = 2**24269 df = pd.DataFrame({'x': np.random.random(n),270 'y': np.random.random(n),271 'z': np.random.random(n)})272 ddf = dd.from_pandas(df, npartitions=50, name='x', sort=False)273 ddf2 = ddf.set_index('x', shuffle=shuffle, npartitions='auto')274 assert 1 < ddf2.npartitions < 20275@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])276def test_set_index_doesnt_increase_partitions(shuffle):277 n = 2**24278 df = pd.DataFrame({'x': np.random.random(n),279 'y': np.random.random(n),280 'z': np.random.random(n)})281 ddf = dd.from_pandas(df, npartitions=2, name='x', sort=False)282 ddf2 = ddf.set_index('x', shuffle=shuffle, npartitions='auto')283 assert ddf2.npartitions <= ddf.npartitions284@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])285def test_set_index_detects_sorted_data(shuffle):286 df = pd.DataFrame({'x': range(100), 'y': range(100)})287 ddf = dd.from_pandas(df, npartitions=10, name='x', sort=False)288 ddf2 = ddf.set_index('x', shuffle=shuffle)289 assert len(ddf2.dask) < ddf.npartitions * 4290def test_set_index_sorts():291 # https://github.com/dask/dask/issues/2288292 vals = np.array([1348550149000000000, 1348550149000000000, 
1348558142000000000,293 1348558142000000000, 1348585928000000000, 1348585928000000000,294 1348600739000000000, 1348601706000000000, 1348600739000000000,295 1348601706000000000, 1348614789000000000, 1348614789000000000,296 1348621037000000000, 1348621038000000000, 1348621040000000000,297 1348621037000000000, 1348621038000000000, 1348621040000000000,298 1348637628000000000, 1348638159000000000, 1348638160000000000,299 1348638159000000000, 1348638160000000000, 1348637628000000000,300 1348646354000000000, 1348646354000000000, 1348659107000000000,301 1348657111000000000, 1348659107000000000, 1348657111000000000,302 1348672876000000000, 1348672876000000000, 1348682787000000000,303 1348681985000000000, 1348682787000000000, 1348681985000000000,304 1348728167000000000, 1348728167000000000, 1348730745000000000,305 1348730745000000000, 1348750198000000000, 1348750198000000000,306 1348750198000000000, 1348753539000000000, 1348753539000000000,307 1348753539000000000, 1348754449000000000, 1348754449000000000,308 1348761333000000000, 1348761554000000000, 1348761610000000000,309 1348761333000000000, 1348761554000000000, 1348761610000000000,310 1348782624000000000, 1348782624000000000, 1348782624000000000,311 1348782624000000000])312 vals = pd.to_datetime(vals, unit='ns')313 breaks = [10, 36, 58]314 dfs = []315 for i in range(len(breaks)):316 lo = sum(breaks[:i])317 hi = sum(breaks[i:i + 1])318 dfs.append(pd.DataFrame({"timestamp": vals[lo:hi]}, index=range(lo, hi)))319 ddf = dd.concat(dfs).clear_divisions()320 assert ddf.set_index("timestamp").index.compute().is_monotonic is True321def test_set_index():322 dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 2, 6]},323 index=[0, 1, 3]),324 ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 5, 8]},325 index=[5, 6, 8]),326 ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [9, 1, 8]},327 index=[9, 9, 9])}328 d = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])329 full = d.compute()330 d2 = d.set_index('b', npartitions=3)331 assert 
d2.npartitions == 3332 assert d2.index.name == 'b'333 assert_eq(d2, full.set_index('b'))334 d3 = d.set_index(d.b, npartitions=3)335 assert d3.npartitions == 3336 assert d3.index.name == 'b'337 assert_eq(d3, full.set_index(full.b))338 d4 = d.set_index('b')339 assert d4.index.name == 'b'340 assert_eq(d4, full.set_index('b'))341def test_set_index_interpolate():342 df = pd.DataFrame({'x': [4, 1, 1, 3, 3], 'y': [1., 1, 1, 1, 2]})343 d = dd.from_pandas(df, 2)344 d1 = d.set_index('x', npartitions=3)345 assert d1.npartitions == 3346 assert set(d1.divisions) == set([1, 2, 3, 4])347 d2 = d.set_index('y', npartitions=3)348 assert d2.divisions[0] == 1.349 assert 1. < d2.divisions[1] < d2.divisions[2] < 2.350 assert d2.divisions[3] == 2.351def test_set_index_interpolate_int():352 L = sorted(list(range(0, 200, 10)) * 2)353 df = pd.DataFrame({'x': 2 * L})354 d = dd.from_pandas(df, 2)355 d1 = d.set_index('x', npartitions=10)356 assert all(np.issubdtype(type(x), np.integer) for x in d1.divisions)357def test_set_index_timezone():358 s_naive = pd.Series(pd.date_range('20130101', periods=3))359 s_aware = pd.Series(pd.date_range('20130101', periods=3, tz='US/Eastern'))360 df = pd.DataFrame({'tz': s_aware, 'notz': s_naive})361 d = dd.from_pandas(df, 2)362 d1 = d.set_index('notz', npartitions=2)363 s1 = pd.DatetimeIndex(s_naive.values, dtype=s_naive.dtype)364 assert d1.divisions[0] == s_naive[0] == s1[0]365 assert d1.divisions[-1] == s_naive[2] == s1[2]366 # We currently lose "freq". 
Converting data with pandas-defined dtypes367 # to numpy or pure Python can be lossy like this.368 d2 = d.set_index('tz', npartitions=2)369 s2 = pd.DatetimeIndex(s_aware, dtype=s_aware.dtype)370 assert d2.divisions[0] == s2[0]371 assert d2.divisions[-1] == s2[2]372 assert d2.divisions[0].tz == s2[0].tz373 assert d2.divisions[0].tz is not None374 s2badtype = pd.DatetimeIndex(s_aware.values, dtype=s_naive.dtype)375 with pytest.raises(TypeError):376 d2.divisions[0] == s2badtype[0]377@pytest.mark.parametrize('drop', [True, False])378def test_set_index_drop(drop):379 pdf = pd.DataFrame({'A': list('ABAABBABAA'),380 'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],381 'C': [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})382 ddf = dd.from_pandas(pdf, 3)383 assert_eq(ddf.set_index('A', drop=drop),384 pdf.set_index('A', drop=drop))385 assert_eq(ddf.set_index('B', drop=drop),386 pdf.set_index('B', drop=drop))387 assert_eq(ddf.set_index('C', drop=drop),388 pdf.set_index('C', drop=drop))389 assert_eq(ddf.set_index(ddf.A, drop=drop),390 pdf.set_index(pdf.A, drop=drop))391 assert_eq(ddf.set_index(ddf.B, drop=drop),392 pdf.set_index(pdf.B, drop=drop))393 assert_eq(ddf.set_index(ddf.C, drop=drop),394 pdf.set_index(pdf.C, drop=drop))395 # numeric columns396 pdf = pd.DataFrame({0: list('ABAABBABAA'),397 1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],398 2: [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})399 ddf = dd.from_pandas(pdf, 3)400 assert_eq(ddf.set_index(0, drop=drop),401 pdf.set_index(0, drop=drop))402 assert_eq(ddf.set_index(2, drop=drop),403 pdf.set_index(2, drop=drop))404def test_set_index_raises_error_on_bad_input():405 df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],406 'b': [7, 6, 5, 4, 3, 2, 1]})407 ddf = dd.from_pandas(df, 2)408 msg = r"Dask dataframe does not yet support multi-indexes"409 with pytest.raises(NotImplementedError) as err:410 ddf.set_index(['a', 'b'])411 assert msg in str(err.value)412def test_set_index_sorted_true():413 df = pd.DataFrame({'x': [1, 2, 3, 4],414 'y': [10, 20, 30, 40],415 'z': [4, 3, 2, 1]})416 
a = dd.from_pandas(df, 2, sort=False)417 assert not a.known_divisions418 b = a.set_index('x', sorted=True)419 assert b.known_divisions420 assert set(a.dask).issubset(set(b.dask))421 for drop in [True, False]:422 assert_eq(a.set_index('x', drop=drop),423 df.set_index('x', drop=drop))424 assert_eq(a.set_index(a.x, sorted=True, drop=drop),425 df.set_index(df.x, drop=drop))426 assert_eq(a.set_index(a.x + 1, sorted=True, drop=drop),427 df.set_index(df.x + 1, drop=drop))428 with pytest.raises(ValueError):429 a.set_index(a.z, sorted=True)430def test_set_index_sorted_single_partition():431 df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [1, 0, 1, 0]})432 ddf = dd.from_pandas(df, npartitions=1)433 assert_eq(ddf.set_index('x', sorted=True),434 df.set_index('x'))435def test_set_index_sorted_min_max_same():436 a = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 0, 0]})437 b = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 1, 1]})438 aa = delayed(a)439 bb = delayed(b)440 df = dd.from_delayed([aa, bb], meta=a)441 assert not df.known_divisions442 df2 = df.set_index('y', sorted=True)443 assert df2.divisions == (0, 1, 1)444def test_compute_divisions():445 from dask.dataframe.shuffle import compute_divisions446 df = pd.DataFrame({'x': [1, 2, 3, 4],447 'y': [10, 20, 30, 40],448 'z': [4, 3, 2, 1]},449 index=[1, 3, 10, 20])450 a = dd.from_pandas(df, 2, sort=False)451 assert not a.known_divisions452 divisions = compute_divisions(a)453 b = copy(a)454 b.divisions = divisions455 assert_eq(a, b, check_divisions=False)456 assert b.known_divisions457def test_temporary_directory(tmpdir):458 df = pd.DataFrame({'x': np.random.random(100),459 'y': np.random.random(100),460 'z': np.random.random(100)})461 ddf = dd.from_pandas(df, npartitions=10, name='x', sort=False)462 with dask.set_options(temporary_directory=str(tmpdir),463 get=dask.multiprocessing.get):464 ddf2 = ddf.set_index('x', shuffle='disk')465 ddf2.compute()466 assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))467def 
test_empty_partitions():468 # See https://github.com/dask/dask/issues/2408469 df = pd.DataFrame({'a': list(range(10))})470 df['b'] = df['a'] % 3471 df['c'] = df['b'].astype(str)472 ddf = dd.from_pandas(df, npartitions=3)473 ddf = ddf.set_index('b')474 ddf = ddf.repartition(npartitions=3)475 ddf.get_partition(0).compute()476 assert_eq(ddf, df.set_index('b'))477 ddf = ddf.set_index('c')...
test_set_index.py
Source:test_set_index.py
...14 {"a": 4, "m": 12, "p": 21},15 ],16 columns=["a", "m", "p", "x"],17 )18 result = df.set_index(["a", "x"])19 expected = df[["m", "p"]]20 expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"])21 tm.assert_frame_equal(result, expected)22 def test_set_index_multiindexcolumns(self):23 columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)])24 df = DataFrame(np.random.randn(3, 3), columns=columns)25 result = df.set_index(df.columns[0])26 expected = df.iloc[:, 1:]27 expected.index = df.iloc[:, 0].values28 expected.index.names = [df.columns[0]]29 tm.assert_frame_equal(result, expected)30 def test_set_index_timezone(self):31 # GH#1235832 # tz-aware Series should retain the tz33 idx = DatetimeIndex(["2014-01-01 10:10:10"], tz="UTC").tz_convert("Europe/Rome")34 df = DataFrame({"A": idx})35 assert df.set_index(idx).index[0].hour == 1136 assert DatetimeIndex(Series(df.A))[0].hour == 1137 assert df.set_index(df.A).index[0].hour == 1138 def test_set_index_cast_datetimeindex(self):39 df = DataFrame(40 {41 "A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],42 "B": np.random.randn(1000),43 }44 )45 idf = df.set_index("A")46 assert isinstance(idf.index, DatetimeIndex)47 def test_set_index_dst(self):48 di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific")49 df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index()50 # single level51 res = df.set_index("index")52 exp = DataFrame(53 data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=Index(di, name="index")54 )55 tm.assert_frame_equal(res, exp)56 # GH#1292057 res = df.set_index(["index", "a"])58 exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"])59 exp = DataFrame({"b": [3, 4, 5]}, index=exp_index)60 tm.assert_frame_equal(res, exp)61 def test_set_index(self, float_string_frame):62 df = float_string_frame63 idx = Index(np.arange(len(df))[::-1])64 df = df.set_index(idx)65 tm.assert_index_equal(df.index, idx)66 with 
pytest.raises(ValueError, match="Length mismatch"):67 df.set_index(idx[::2])68 def test_set_index_names(self):69 df = tm.makeDataFrame()70 df.index.name = "name"71 assert df.set_index(df.index).index.names == ["name"]72 mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"])73 mi2 = MultiIndex.from_arrays(74 df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"]75 )76 df = df.set_index(["A", "B"])77 assert df.set_index(df.index).index.names == ["A", "B"]78 # Check that set_index isn't converting a MultiIndex into an Index79 assert isinstance(df.set_index(df.index).index, MultiIndex)80 # Check actual equality81 tm.assert_index_equal(df.set_index(df.index).index, mi)82 idx2 = df.index.rename(["C", "D"])83 # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather84 # than a pair of tuples85 assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)86 # Check equality87 tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)88 def test_set_index_cast(self):89 # issue casting an index then set_index90 df = DataFrame(91 {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2]}, index=[2010, 2011, 2012]92 )93 df2 = df.set_index(df.index.astype(np.int32))94 tm.assert_frame_equal(df, df2)95 # A has duplicate values, C does not96 @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])97 @pytest.mark.parametrize("inplace", [True, False])98 @pytest.mark.parametrize("drop", [True, False])99 def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys):100 df = frame_of_index_cols101 if isinstance(keys, list):102 idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys)103 else:104 idx = Index(df[keys], name=keys)105 expected = df.drop(keys, axis=1) if drop else df106 expected.index = idx107 if inplace:108 result = df.copy()109 return_value = result.set_index(keys, drop=drop, inplace=True)110 assert return_value is None111 else:112 result = df.set_index(keys, drop=drop)113 
tm.assert_frame_equal(result, expected)114 # A has duplicate values, C does not115 @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])116 @pytest.mark.parametrize("drop", [True, False])117 def test_set_index_append(self, frame_of_index_cols, drop, keys):118 df = frame_of_index_cols119 keys = keys if isinstance(keys, list) else [keys]120 idx = MultiIndex.from_arrays(121 [df.index] + [df[x] for x in keys], names=[None] + keys122 )123 expected = df.drop(keys, axis=1) if drop else df.copy()124 expected.index = idx125 result = df.set_index(keys, drop=drop, append=True)126 tm.assert_frame_equal(result, expected)127 # A has duplicate values, C does not128 @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])129 @pytest.mark.parametrize("drop", [True, False])130 def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys):131 # append to existing multiindex132 df = frame_of_index_cols.set_index(["D"], drop=drop, append=True)133 keys = keys if isinstance(keys, list) else [keys]134 expected = frame_of_index_cols.set_index(["D"] + keys, drop=drop, append=True)135 result = df.set_index(keys, drop=drop, append=True)136 tm.assert_frame_equal(result, expected)137 def test_set_index_after_mutation(self):138 # GH#1590139 df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]})140 expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key"))141 df2 = df.loc[df.index.map(lambda indx: indx >= 1)]142 result = df2.set_index("key")143 tm.assert_frame_equal(result, expected)144 # MultiIndex constructor does not work directly on Series -> lambda145 # Add list-of-list constructor because list is ambiguous -> lambda146 # also test index name if append=True (name is duplicate here for B)147 @pytest.mark.parametrize(148 "box",149 [150 Series,151 Index,152 np.array,153 list,154 lambda x: [list(x)],155 lambda x: MultiIndex.from_arrays([x]),156 ],157 )158 @pytest.mark.parametrize(159 "append, index_name", 
[(True, None), (True, "B"), (True, "test"), (False, None)]160 )161 @pytest.mark.parametrize("drop", [True, False])162 def test_set_index_pass_single_array(163 self, frame_of_index_cols, drop, append, index_name, box164 ):165 df = frame_of_index_cols166 df.index.name = index_name167 key = box(df["B"])168 if box == list:169 # list of strings gets interpreted as list of keys170 msg = "['one', 'two', 'three', 'one', 'two']"171 with pytest.raises(KeyError, match=msg):172 df.set_index(key, drop=drop, append=append)173 else:174 # np.array/list-of-list "forget" the name of B175 name_mi = getattr(key, "names", None)176 name = [getattr(key, "name", None)] if name_mi is None else name_mi177 result = df.set_index(key, drop=drop, append=append)178 # only valid column keys are dropped179 # since B is always passed as array above, nothing is dropped180 expected = df.set_index(["B"], drop=False, append=append)181 expected.index.names = [index_name] + name if append else name182 tm.assert_frame_equal(result, expected)183 # MultiIndex constructor does not work directly on Series -> lambda184 # also test index name if append=True (name is duplicate here for A & B)185 @pytest.mark.parametrize(186 "box", [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]187 )188 @pytest.mark.parametrize(189 "append, index_name",190 [(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)],191 )192 @pytest.mark.parametrize("drop", [True, False])193 def test_set_index_pass_arrays(194 self, frame_of_index_cols, drop, append, index_name, box195 ):196 df = frame_of_index_cols197 df.index.name = index_name198 keys = ["A", box(df["B"])]199 # np.array/list "forget" the name of B200 names = ["A", None if box in [np.array, list, tuple, iter] else "B"]201 result = df.set_index(keys, drop=drop, append=append)202 # only valid column keys are dropped203 # since B is always passed as array above, only A is dropped, if at all204 expected = df.set_index(["A", "B"], drop=False, 
append=append)205 expected = expected.drop("A", axis=1) if drop else expected206 expected.index.names = [index_name] + names if append else names207 tm.assert_frame_equal(result, expected)208 # MultiIndex constructor does not work directly on Series -> lambda209 # We also emulate a "constructor" for the label -> lambda210 # also test index name if append=True (name is duplicate here for A)211 @pytest.mark.parametrize(212 "box2",213 [214 Series,215 Index,216 np.array,217 list,218 iter,219 lambda x: MultiIndex.from_arrays([x]),220 lambda x: x.name,221 ],222 )223 @pytest.mark.parametrize(224 "box1",225 [226 Series,227 Index,228 np.array,229 list,230 iter,231 lambda x: MultiIndex.from_arrays([x]),232 lambda x: x.name,233 ],234 )235 @pytest.mark.parametrize(236 "append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)]237 )238 @pytest.mark.parametrize("drop", [True, False])239 def test_set_index_pass_arrays_duplicate(240 self, frame_of_index_cols, drop, append, index_name, box1, box2241 ):242 df = frame_of_index_cols243 df.index.name = index_name244 keys = [box1(df["A"]), box2(df["A"])]245 result = df.set_index(keys, drop=drop, append=append)246 # if either box is iter, it has been consumed; re-read247 keys = [box1(df["A"]), box2(df["A"])]248 # need to adapt first drop for case that both keys are 'A' --249 # cannot drop the same column twice;250 # plain == would give ambiguous Boolean error for containers251 first_drop = (252 False253 if (254 isinstance(keys[0], str)255 and keys[0] == "A"256 and isinstance(keys[1], str)257 and keys[1] == "A"258 )259 else drop260 )261 # to test against already-tested behaviour, we add sequentially,262 # hence second append always True; must wrap keys in list, otherwise263 # box = list would be interpreted as keys264 expected = df.set_index([keys[0]], drop=first_drop, append=append)265 expected = expected.set_index([keys[1]], drop=drop, append=True)266 tm.assert_frame_equal(result, expected)267 
@pytest.mark.parametrize("append", [True, False])268 @pytest.mark.parametrize("drop", [True, False])269 def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append):270 df = frame_of_index_cols271 keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"])272 result = df.set_index(keys, drop=drop, append=append)273 # setting with a MultiIndex will never drop columns274 expected = df.set_index(["A", "B"], drop=False, append=append)275 tm.assert_frame_equal(result, expected)276 def test_construction_with_categorical_index(self):277 ci = tm.makeCategoricalIndex(10)278 ci.name = "B"279 # with Categorical280 df = DataFrame({"A": np.random.randn(10), "B": ci.values})281 idf = df.set_index("B")282 tm.assert_index_equal(idf.index, ci)283 # from a CategoricalIndex284 df = DataFrame({"A": np.random.randn(10), "B": ci})285 idf = df.set_index("B")286 tm.assert_index_equal(idf.index, ci)287 # round-trip288 idf = idf.reset_index().set_index("B")289 tm.assert_index_equal(idf.index, ci)290class TestSetIndexInvalid:291 def test_set_index_verify_integrity(self, frame_of_index_cols):292 df = frame_of_index_cols293 with pytest.raises(ValueError, match="Index has duplicate keys"):294 df.set_index("A", verify_integrity=True)295 # with MultiIndex296 with pytest.raises(ValueError, match="Index has duplicate keys"):297 df.set_index([df["A"], df["A"]], verify_integrity=True)298 @pytest.mark.parametrize("append", [True, False])299 @pytest.mark.parametrize("drop", [True, False])300 def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):301 df = frame_of_index_cols302 with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):303 # column names are A-E, as well as one tuple304 df.set_index(["foo", "bar", "baz"], drop=drop, append=append)305 # non-existent key in list with arrays306 with pytest.raises(KeyError, match="X"):307 df.set_index([df["A"], df["B"], "X"], drop=drop, append=append)308 msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"309 # tuples always 
raise KeyError310 with pytest.raises(KeyError, match=msg):311 df.set_index(tuple(df["A"]), drop=drop, append=append)312 # also within a list313 with pytest.raises(KeyError, match=msg):314 df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append)315 @pytest.mark.parametrize("append", [True, False])316 @pytest.mark.parametrize("drop", [True, False])317 @pytest.mark.parametrize("box", [set], ids=["set"])318 def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append):319 df = frame_of_index_cols320 msg = 'The parameter "keys" may be a column key, .*'321 # forbidden type, e.g. set322 with pytest.raises(TypeError, match=msg):323 df.set_index(box(df["A"]), drop=drop, append=append)324 # forbidden type in list, e.g. set325 with pytest.raises(TypeError, match=msg):326 df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append)327 # MultiIndex constructor does not work directly on Series -> lambda328 @pytest.mark.parametrize(329 "box",330 [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])],331 ids=["Series", "Index", "np.array", "iter", "MultiIndex"],332 )333 @pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"])334 @pytest.mark.parametrize("append", [True, False])335 @pytest.mark.parametrize("drop", [True, False])336 def test_set_index_raise_on_len(337 self, frame_of_index_cols, box, length, drop, append338 ):339 # GH 24984340 df = frame_of_index_cols # has length 5341 values = np.random.randint(0, 10, (length,))342 msg = "Length mismatch: Expected 5 rows, received array of length.*"343 # wrong length directly344 with pytest.raises(ValueError, match=msg):345 df.set_index(box(values), drop=drop, append=append)346 # wrong length in list347 with pytest.raises(ValueError, match=msg):348 df.set_index(["A", df.A, box(values)], drop=drop, append=append)349class TestSetIndexCustomLabelType:350 def test_set_index_custom_label_type(self):351 # GH#24969352 class Thing:353 def __init__(self, name, color):354 
self.name = name355 self.color = color356 def __str__(self) -> str:357 return f"<Thing {repr(self.name)}>"358 # necessary for pretty KeyError359 __repr__ = __str__360 thing1 = Thing("One", "red")361 thing2 = Thing("Two", "blue")362 df = DataFrame({thing1: [0, 1], thing2: [2, 3]})363 expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))364 # use custom label directly365 result = df.set_index(thing2)366 tm.assert_frame_equal(result, expected)367 # custom label wrapped in list368 result = df.set_index([thing2])369 tm.assert_frame_equal(result, expected)370 # missing key371 thing3 = Thing("Three", "pink")372 msg = "<Thing 'Three'>"373 with pytest.raises(KeyError, match=msg):374 # missing label directly375 df.set_index(thing3)376 with pytest.raises(KeyError, match=msg):377 # missing label in list378 df.set_index([thing3])379 def test_set_index_custom_label_hashable_iterable(self):380 # GH#24969381 # actual example discussed in GH 24984 was e.g. for shapely.geometry382 # objects (e.g. 
a collection of Points) that can be both hashable and383 # iterable; using frozenset as a stand-in for testing here384 class Thing(frozenset):385 # need to stabilize repr for KeyError (due to random order in sets)386 def __repr__(self) -> str:387 tmp = sorted(self)388 joined_reprs = ", ".join(map(repr, tmp))389 # double curly brace prints one brace in format string390 return f"frozenset({{{joined_reprs}}})"391 thing1 = Thing(["One", "red"])392 thing2 = Thing(["Two", "blue"])393 df = DataFrame({thing1: [0, 1], thing2: [2, 3]})394 expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))395 # use custom label directly396 result = df.set_index(thing2)397 tm.assert_frame_equal(result, expected)398 # custom label wrapped in list399 result = df.set_index([thing2])400 tm.assert_frame_equal(result, expected)401 # missing key402 thing3 = Thing(["Three", "pink"])403 msg = r"frozenset\(\{'Three', 'pink'\}\)"404 with pytest.raises(KeyError, match=msg):405 # missing label directly406 df.set_index(thing3)407 with pytest.raises(KeyError, match=msg):408 # missing label in list409 df.set_index([thing3])410 def test_set_index_custom_label_type_raises(self):411 # GH#24969412 # purposefully inherit from something unhashable413 class Thing(set):414 def __init__(self, name, color):415 self.name = name416 self.color = color417 def __str__(self) -> str:418 return f"<Thing {repr(self.name)}>"419 thing1 = Thing("One", "red")420 thing2 = Thing("Two", "blue")421 df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2])422 msg = 'The parameter "keys" may be a column key, .*'423 with pytest.raises(TypeError, match=msg):424 # use custom label directly425 df.set_index(thing2)426 with pytest.raises(TypeError, match=msg):427 # custom label wrapped in list...
Learn to execute automation testing from scratch with LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing FREE!