# Footprint of `df` before any optimisation, reported in units of 1024**2
# bytes. deep=True makes pandas inspect object-dtype values instead of
# counting only the pointer-sized slots that reference them.
total_bytes = df.memory_usage(deep=True).sum()
baseline = total_bytes / (1024 * 1024)
print(f"baseline: {baseline:.2f} MB")
# Pick out the 64-bit integer and float columns up front, before any dtype
# in the frame is changed.
int_cols = df.select_dtypes(include=["int64"]).columns
float_cols = df.select_dtypes(include=["float64"]).columns

# pd.to_numeric(..., downcast=...) converts each column to the smallest dtype
# of the requested family that can still represent its values.
for selected_cols, downcast_family in (
    (int_cols, "integer"),
    (float_cols, "float"),
):
    df[selected_cols] = df[selected_cols].apply(
        pd.to_numeric, downcast=downcast_family
    )
# Columns re-encoded with the pandas "category" dtype.
# NOTE(review): presumably these hold a small set of repeated labels, which is
# where the category dtype saves memory -- confirm against the data.
category_columns = ("region", "device", "segment")

# Converted one column at a time so each column's dtype is replaced outright.
for column_name in category_columns:
    df[column_name] = df[column_name].astype("category")
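# category
# NOTE(review): stray token, presumably the dtype name of the columns converted
# above; kept as a comment so it is not evaluated as a bare name (NameError).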
# Sum "amount" per "event_type" without loading events.csv in one piece: the
# file is read in chunks of 200,000 rows and each chunk is reduced to its own
# per-key partial sums.
#
# The reader is opened as a context manager so the underlying file handle is
# closed even when processing a chunk raises (for example a missing column).
# This requires pandas >= 1.2.
agg = []
with pd.read_csv("events.csv", chunksize=200_000) as chunks:
    for chunk in chunks:
        part = chunk.groupby("event_type")["amount"].sum()
        agg.append(part)

# The same event_type can occur in several chunks, so the concatenated
# partial sums are grouped on their index (level 0) and summed once more.
result = pd.concat(agg).groupby(level=0).sum()