Recipes for pandas
Vectorized vs. Pure Python
- In general, do not use for loops with pandas or numpy
- For large N, performance is ~100x slower with python for loop vs. pandas or numpy methods
- See also benchmarks for comparisons with Julia
import pandas as pd
import numpy as np
import perfplot
def make_series(n):
return pd.Series(np.random.randint(10, size=n))
def sum_python(x):
return sum(e for e in x)
def sum_pandas(x):
return x.sum()
perfplot.show(
setup=make_series,
kernels=[sum_python, sum_pandas],
n_range=[2**k for k in range(2, 25)],
logx=True,
logy=True,
equality_check=False,
xlabel='len(series)'
)
Query styles and performance
- For breakdown, see: https://stackoverflow.com/a/49937318
- When using pandas data structures, use boolean indexing, e.g.
x[x > k]
- For maximum performance, use numexpr on numpy arrays directly, e.g.
x[numexpr.evaluate('x > k')]
import numexpr
import pandas as pd
import numpy as np
import perfplot
np.random.seed(1337)
def numexpr_nparray(x):
x = x.A.values
return x[numexpr.evaluate('(x > 5)')]
def numexpr_pandas(x):
return x[numexpr.evaluate('(x > 5)')]
def np_boolean_mask(x):
return x[x.A.values > 5]
def query(x):
return x.query('A > 5')
def pandas_eval(x):
return x[x.eval('A > 5')]
def pdseries_boolean_mask(x):
return x[x.A > 5]
def make_df(n):
df = pd.DataFrame(np.random.randint(10, size=n), columns=['A'])
return df
perfplot.show(
setup=make_df,
kernels=[numexpr_nparray, numexpr_pandas, np_boolean_mask, query, pandas_eval, pdseries_boolean_mask],
n_range=[2**k for k in range(2, 25)],
logx=True,
logy=True,
equality_check=False,
xlabel='len(df)')
df = pd.read_csv('https://raw.githubusercontent.com/tidyverse/ggplot2/master/data-raw/diamonds.csv')
df.head(3)
categories = {
'cut': {'ordered': True, 'categories': ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']},
'color': {'ordered': True, 'categories': ['J', 'I', 'H', 'G', 'F', 'E', 'D']},
'clarity': {'ordered': True, 'categories': ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']},
}
df2 = categorize(df, categories)
pd.DataFrame({'old_dtypes': df.dtypes, 'new_dtypes': df2.dtypes})
Method chaining
- Used to create a declarative (English-readable) set of operations happening on the dataframe
- Abstract away complexity from the reader, but have it available if they want to see
- Subjective: Looks cleaner to my eyes
- Drawbacks: Can make debugging difficult if you are not familiar with how it works.
- Method chaining syntax (
.pipe
) may be familiar if coming from JS (.then
), Bash (|
), or R (%>%
)df = ( pd.read_parquet('my_file.parquet') .pipe(clean_data) .pipe(generate_features) )
@pd.api.extensions.register_dataframe_accessor("top_level")
class TopLevelAccessor:
def __init__(self, df):
self._df = df
self.level_a = LevelA(df)
self.level_b = LevelB(df)
def do_something(self): pass
class LevelA:
def __init__(self, df):
self._df = df
def do_something_else(self): pass
class LevelB:
def __init__(self, df):
self._df = df
df.top_level.do_something()
df.top_level.level_a.do_something_else()