pandas tricks(pandas tricks)

"""A collection of small pandas tricks, written as a runnable script.

Each section is independent and prints its results.  The original paste used
typographic quotes, had every statement fused onto one line, and relied on
``pd.np`` and ``pd.util.testing`` (both removed from modern pandas); this
version is valid Python and only uses stable public API.
"""
import numpy as np
import pandas as pd


def _make_time_frame(rows=30):
    """Return a random float frame (columns A-D) on a business-day index."""
    index = pd.date_range("2000-01-03", periods=rows, freq="B")
    return pd.DataFrame(
        np.random.randn(rows, 4), index=index, columns=list("ABCD")
    )


# --- Check for equality ---------------------------------------------------
# Create the DataFrame.
df = pd.DataFrame({'a': [10, 40, np.nan], 'b': [10, 40, np.nan]})
print('data:\n', df)
print()

# Check whether columns a and b are equal element-wise.
print('df.a == df.b:')
print(df.a == df.b)
print()

# Comparing two NaN values with == yields False.
print('np.nan == np.nan:')
print(np.nan == np.nan)

# The equals() method treats NaNs in the same position as equal.
print()
print('df.a.equals(df.b):')
print(df.a.equals(df.b))
print()

# assert_series_equal can be used as well.  It returns None and raises
# AssertionError on a mismatch, so there is nothing useful to print.
print('pd.testing.assert_series_equal(df.a, df.b, check_names=False, check_dtype=False):')
pd.testing.assert_series_equal(df.a, df.b, check_names=False, check_dtype=False)
print()

# assert_frame_equal compares whole frames; it raises if they differ.
df_new = df.copy()
pd.testing.assert_frame_equal(df, df_new)

# --- Random data with NumPy -----------------------------------------------
# The ``pd.np`` alias was removed from pandas, so NumPy is imported directly.
np.random.seed(0)
d1 = pd.DataFrame(np.random.rand(2, 4))
print('d1:\n', d1)
d1.loc[0, 0] = np.nan
print('d1:\n', d1)

# --- Calculate memory usage -----------------------------------------------
df.info(memory_usage='deep')
# Memory used by each column.
print(df.memory_usage(deep=True))

# --- Convert one set of values to another ---------------------------------
# factorize() returns (codes, uniques); missing values get the code -1.
codes = df.a.factorize()[0]
df['c'] = codes
print(df)

# --- Build a datetime column from separate month/day/year columns ---------
df = pd.DataFrame(
    [[12, 25, 2019, 'christmas'], [11, 28, 2019, 'thanksgiving']],
    columns=['month', 'day', 'year', 'holiday'],
)
print(df)
df['date'] = pd.to_datetime(df[['month', 'day', 'year']])
print(df)

# --- Create an example DataFrame ------------------------------------------
# The pd.util.testing.make* helpers no longer exist in current pandas, so
# equivalent example frames are built by hand.
example = pd.DataFrame(np.random.randn(30, 4), columns=list("ABCD"))
print(example.head())

# Same kind of frame with roughly 10% of the cells blanked out.
example_missing = example.mask(np.random.rand(30, 4) < 0.1)
print(example_missing.head())

# Resample on the DatetimeIndex.  An offset object is used instead of the
# 'M' alias, which newer pandas versions deprecate in favour of 'ME'.
df = _make_time_frame().head()
print(df.resample(pd.offsets.MonthEnd()).A.mean())

# Resample on a datetime column: reset_index() moves the unnamed index into
# a column called 'index'.
df = _make_time_frame().head()
df.reset_index(inplace=True)
print(df.resample('D', on='index').A.mean())

# --- Saved CSV files can be compressed ------------------------------------
# The compression format is inferred from the file extension.
df.to_csv('dataframe.csv.zip')
df.to_csv('dataframe.csv.gz')
df.to_csv('dataframe.csv.bz2')
df.to_csv('dataframe.csv.xz')

# --- Fill missing values using interpolation ------------------------------
df = pd.DataFrame({'a': [100, 120, 130, np.nan, 140], 'b': [9, 9, np.nan, 7.5, 6.5]})
df.index = pd.to_datetime(['2019-01', '2019-02', '2019-03', '2019-04', '2019-05'])
print(df)
interpolated = df.interpolate()
print(interpolated)

# --- Check for duplicate merge keys ---------------------------------------
left = pd.DataFrame({'color': ['green', 'yellow', 'red'], 'num': [1, 2, 3]})
print(left)
right = pd.DataFrame({'color': ['green', 'yellow', 'pink', 'green'], 'size': ['S', 'M', 'L', 'XL']})
print(right)
# validate='one_to_many' raises MergeError if the left keys are not unique.
merged = pd.merge(left, right, how='inner', validate='one_to_many')
print(merged)

# --- Other dataset-creation helpers ---------------------------------------
# Older pandas versions expose make* helpers in a private testing module;
# list whatever this installation still provides (possibly nothing).
_testing_module = getattr(pd, "_testing", None)
make_helpers = [x for x in dir(_testing_module) if x.startswith('make')]
print(make_helpers)

————————

"""A collection of small pandas tricks, written as a runnable script.

Each section is independent and prints its results.  The original paste used
typographic quotes, had every statement fused onto one line, and relied on
``pd.np`` and ``pd.util.testing`` (both removed from modern pandas); this
version is valid Python and only uses stable public API.
"""
import numpy as np
import pandas as pd


def _make_time_frame(rows=30):
    """Return a random float frame (columns A-D) on a business-day index."""
    index = pd.date_range("2000-01-03", periods=rows, freq="B")
    return pd.DataFrame(
        np.random.randn(rows, 4), index=index, columns=list("ABCD")
    )


# --- Check for equality ---------------------------------------------------
# Create the DataFrame.
df = pd.DataFrame({'a': [10, 40, np.nan], 'b': [10, 40, np.nan]})
print('data:\n', df)
print()

# Check whether columns a and b are equal element-wise.
print('df.a == df.b:')
print(df.a == df.b)
print()

# Comparing two NaN values with == yields False.
print('np.nan == np.nan:')
print(np.nan == np.nan)

# The equals() method treats NaNs in the same position as equal.
print()
print('df.a.equals(df.b):')
print(df.a.equals(df.b))
print()

# assert_series_equal can be used as well.  It returns None and raises
# AssertionError on a mismatch, so there is nothing useful to print.
print('pd.testing.assert_series_equal(df.a, df.b, check_names=False, check_dtype=False):')
pd.testing.assert_series_equal(df.a, df.b, check_names=False, check_dtype=False)
print()

# assert_frame_equal compares whole frames; it raises if they differ.
df_new = df.copy()
pd.testing.assert_frame_equal(df, df_new)

# --- Random data with NumPy -----------------------------------------------
# The ``pd.np`` alias was removed from pandas, so NumPy is imported directly.
np.random.seed(0)
d1 = pd.DataFrame(np.random.rand(2, 4))
print('d1:\n', d1)
d1.loc[0, 0] = np.nan
print('d1:\n', d1)

# --- Calculate memory usage -----------------------------------------------
df.info(memory_usage='deep')
# Memory used by each column.
print(df.memory_usage(deep=True))

# --- Convert one set of values to another ---------------------------------
# factorize() returns (codes, uniques); missing values get the code -1.
codes = df.a.factorize()[0]
df['c'] = codes
print(df)

# --- Build a datetime column from separate month/day/year columns ---------
df = pd.DataFrame(
    [[12, 25, 2019, 'christmas'], [11, 28, 2019, 'thanksgiving']],
    columns=['month', 'day', 'year', 'holiday'],
)
print(df)
df['date'] = pd.to_datetime(df[['month', 'day', 'year']])
print(df)

# --- Create an example DataFrame ------------------------------------------
# The pd.util.testing.make* helpers no longer exist in current pandas, so
# equivalent example frames are built by hand.
example = pd.DataFrame(np.random.randn(30, 4), columns=list("ABCD"))
print(example.head())

# Same kind of frame with roughly 10% of the cells blanked out.
example_missing = example.mask(np.random.rand(30, 4) < 0.1)
print(example_missing.head())

# Resample on the DatetimeIndex.  An offset object is used instead of the
# 'M' alias, which newer pandas versions deprecate in favour of 'ME'.
df = _make_time_frame().head()
print(df.resample(pd.offsets.MonthEnd()).A.mean())

# Resample on a datetime column: reset_index() moves the unnamed index into
# a column called 'index'.
df = _make_time_frame().head()
df.reset_index(inplace=True)
print(df.resample('D', on='index').A.mean())

# --- Saved CSV files can be compressed ------------------------------------
# The compression format is inferred from the file extension.
df.to_csv('dataframe.csv.zip')
df.to_csv('dataframe.csv.gz')
df.to_csv('dataframe.csv.bz2')
df.to_csv('dataframe.csv.xz')

# --- Fill missing values using interpolation ------------------------------
df = pd.DataFrame({'a': [100, 120, 130, np.nan, 140], 'b': [9, 9, np.nan, 7.5, 6.5]})
df.index = pd.to_datetime(['2019-01', '2019-02', '2019-03', '2019-04', '2019-05'])
print(df)
interpolated = df.interpolate()
print(interpolated)

# --- Check for duplicate merge keys ---------------------------------------
left = pd.DataFrame({'color': ['green', 'yellow', 'red'], 'num': [1, 2, 3]})
print(left)
right = pd.DataFrame({'color': ['green', 'yellow', 'pink', 'green'], 'size': ['S', 'M', 'L', 'XL']})
print(right)
# validate='one_to_many' raises MergeError if the left keys are not unique.
merged = pd.merge(left, right, how='inner', validate='one_to_many')
print(merged)

# --- Other dataset-creation helpers ---------------------------------------
# Older pandas versions expose make* helpers in a private testing module;
# list whatever this installation still provides (possibly nothing).
_testing_module = getattr(pd, "_testing", None)
make_helpers = [x for x in dir(_testing_module) if x.startswith('make')]
print(make_helpers)