---
name: pandas
description: This skill should be used when the user asks to "use pandas", "analyze data with pandas", "work with DataFrames", "clean data with pandas", or needs guidance on pandas best practices, data manipulation, performance optimization, or common pandas patterns.
---
Install:

    npx skill4agent add the-perfect-developer/the-perfect-opencode pandas

## Core Data Structures: DataFrame and Series

import pandas as pd
import numpy as np
# Create DataFrame from dict
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie"],
"score": [95, 82, 78],
"grade": pd.Categorical(["A", "B", "C"]),
})
# Create Series
s = pd.Series([10, 20, 30], index=["a", "b", "c"], name="values")

## Selection: .loc (label-based) and .iloc (position-based)

# Label-based selection
df.loc[0, "name"] # single value
df.loc[:, ["name", "score"]] # multiple columns
df.loc[df["score"] > 80] # boolean mask
# Position-based selection
df.iloc[0, 1] # row 0, col 1
df.iloc[1:3, :] # rows 1-2, all columns

## Avoid Chained Indexing

Chained indexing such as df["col"][condition] silently fails or raises ChainedAssignmentError in pandas 3.0 — always assign through .loc instead.

# Wrong — chained assignment, silently fails or raises in pandas 3.0
df["score"][df["name"] == "Bob"] = 90
# Correct
df.loc[df["name"] == "Bob", "score"] = 90

## Copy-on-Write

# Triggers a copy on every subsequent mutation (wasteful)
df2 = df.reset_index(drop=True)
df2.iloc[0, 0] = 100 # triggers copy because df and df2 share data
# Reassign to same variable — no extra copy needed
df = df.reset_index(drop=True)
df.iloc[0, 0] = 100 # operates in-place, no shared data

## Data Types and Memory

| Data | Default dtype | Optimal dtype |
|---|---|---|
| Low-cardinality strings | object | category |
| Large integers | int64 | int32 / uint32 (downcast) |
| Booleans with NA | object | boolean |
| Floats | float64 | float32 |
# Convert low-cardinality string column to Categorical (~10x memory reduction)
df["status"] = df["status"].astype("category")
# Downcast numerics
df["id"] = pd.to_numeric(df["id"], downcast="unsigned")
df["value"] = pd.to_numeric(df["value"], downcast="float")
# Check memory usage
df.memory_usage(deep=True)

Use pd.Categorical (or astype("category")) for repeated string values.

## Vectorization

# Wrong — Python loop, slow
results = []
for _, row in df.iterrows():
results.append(row["x"] * row["y"])
# Correct — vectorized
df["product"] = df["x"] * df["y"]
# Use .str accessor for string operations (vectorized)
df["name_upper"] = df["name"].str.upper()
df["name_len"] = df["name"].str.len()
# Use .dt accessor for datetime operations
df["year"] = df["timestamp"].dt.year
df["month"] = df["timestamp"].dt.month

Avoid df.apply(func, axis=1) when a vectorized form exists; pass raw=True where supported to hand the function a NumPy array. If per-row iteration is truly unavoidable, prefer itertuples() over iterrows().

## Missing Data: NaN, pd.NaT, and pd.NA

# Detect missing
df.isna() # boolean mask
df["col"].notna() # inverse mask
# Fill missing values
df["score"].fillna(0)
df["score"].fillna(df["score"].median())
# Drop rows with missing values
df.dropna(subset=["score", "name"])
# Use nullable integer (preserves int type with NAs)
s = pd.array([1, 2, None], dtype="Int64") # capital I = nullable

Nullable dtypes — Int64, Float64, boolean, string — preserve the logical type with missing values instead of coercing to float64.

## GroupBy

# Aggregation — returns one row per group
df.groupby("grade")["score"].mean()
df.groupby("grade").agg({"score": ["mean", "std", "count"]})
# Named aggregation (pandas 0.25+)
df.groupby("grade").agg(
avg_score=("score", "mean"),
count=("name", "count"),
)
# Transformation — returns same shape as input
df["score_zscore"] = df.groupby("grade")["score"].transform(
lambda s: (s - s.mean()) / s.std()
)
# Filter — keep groups matching a condition
df.groupby("grade").filter(lambda g: len(g) >= 2)

Prefer transform over apply when the result has the same shape as the input — transform is broadcast back to the original index.

## Reshaping and Combining

# Merge (SQL-style join)
result = pd.merge(left, right, on="key", how="left")
result = pd.merge(left, right, left_on="id", right_on="user_id", how="inner")
# Concat along rows
pd.concat([df1, df2], ignore_index=True)
# Pivot — long to wide
pivot = df.pivot_table(index="date", columns="category", values="amount", aggfunc="sum")
# Melt — wide to long
melted = df.melt(id_vars=["id"], value_vars=["q1", "q2", "q3"], var_name="quarter")

Pass ignore_index=True to pd.concat when the original row labels are not meaningful.

## I/O

| Format | Read | Write | Notes |
|---|---|---|---|
| CSV | read_csv | to_csv | Use dtype= and usecols= at read time |
| Parquet | read_parquet | to_parquet | Preferred for large datasets |
| JSON | read_json | to_json | Use orient= to control layout |
| Excel | read_excel | to_excel | Slow; avoid for large files |
| SQL | read_sql | to_sql | Use a SQLAlchemy connectable |
# Specify dtypes at read time — avoids post-hoc conversion cost
df = pd.read_csv(
"data.csv",
usecols=["id", "name", "amount"],
dtype={"id": "int32", "name": "category"},
parse_dates=["created_at"],
)
# Parquet with column selection — reads only needed columns from disk
df = pd.read_parquet("data.parquet", columns=["id", "amount"])

## Rolling and Element-wise Operations

# Use raw=True for numeric operations — passes NumPy array, not Series
df["result"] = df["value"].rolling(10).apply(np.sum, raw=True)
# Use engine="numba" for large datasets (>1M rows)
df["result"] = df["value"].rolling(10).apply(my_func, raw=True, engine="numba")
# Prefer map over apply for element-wise Series operations
df["label"] = df["code"].map({1: "low", 2: "mid", 3: "high"})

Avoid apply with axis=1 when a vectorized alternative exists.

## Quick Reference

| Task | Method |
|---|---|
| Shape | df.shape |
| Column dtypes | df.dtypes |
| Summary statistics | df.describe() |
| Unique values | df["col"].unique() |
| Value counts | df["col"].value_counts() |
| Sort | df.sort_values("col") |
| Rename columns | df.rename(columns={"old": "new"}) |
| Drop columns | df.drop(columns=["col"]) |
| Reset index | df.reset_index(drop=True) |
| Filter rows | df[df["col"] > 0] |
| Select by dtype | df.select_dtypes(include="number") |
| Duplicate detection | df.duplicated() / df.drop_duplicates() |
| Apply function | df["col"].map(func) |
Additional references:

- references/data-types-and-memory.md
- references/performance-and-scaling.md