Expert in statistical analysis, predictive modeling, machine learning, and data storytelling to drive business insights.
```bash
npx skill4agent add 404kidwiz/claude-supercode-skills data-scientist
```

```python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load data
df = pd.read_csv("customer_data.csv")
# Basic profiling
df.info()
print(df.describe())
# Missing values analysis
missing = df.isnull().sum() / len(df)
print(missing[missing > 0].sort_values(ascending=False))
```

```python
# Numerical features
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(df[col], kde=True)
    plt.subplot(1, 2, 2)
    sns.boxplot(x=df[col])
    plt.show()
# Categorical features
cat_cols = df.select_dtypes(exclude=[np.number]).columns
for col in cat_cols:
    print(df[col].value_counts(normalize=True))
```

```python
# Correlation matrix
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
# Target vs Features
target = 'churn'
sns.boxplot(x=target, y='tenure', data=df)
```

```python
# Impute missing values
df['age'] = df['age'].fillna(df['age'].median())
df['category'] = df['category'].fillna('Unknown')
# Handle outliers (Example: Cap at 99th percentile)
cap = df['income'].quantile(0.99)
df['income'] = np.where(df['income'] > cap, cap, df['income'])
```

A/B test analysis:

```python
# data: ['user_id', 'group', 'converted']
results = df.groupby('group')['converted'].agg(['count', 'sum', 'mean'])
results.columns = ['n_users', 'conversions', 'conversion_rate']
print(results)
```

```python
from statsmodels.stats.proportion import proportions_ztest

control = results.loc['A']
treatment = results.loc['B']
count = np.array([treatment['conversions'], control['conversions']])
nobs = np.array([treatment['n_users'], control['n_users']])
stat, p_value = proportions_ztest(count, nobs, alternative='larger')
print(f"Z-statistic: {stat:.4f}")
print(f"P-value: {p_value:.4f}")from statsmodels.stats.proportion import proportion_confint
(lower_treat, lower_con), (upper_treat, upper_con) = proportion_confint(count, nobs, alpha=0.05)
print(f"Control CI: [{lower_con:.4f}, {upper_con:.4f}]")
print(f"Treatment CI: [{lower_treat:.4f}, {upper_treat:.4f}]")from sklearn.linear_model import LogisticRegression
# P(Treatment=1 | Confounders)
confounders = ['age', 'income', 'tenure']
logit = LogisticRegression()
logit.fit(df[confounders], df['is_premium'])
df['propensity_score'] = logit.predict_proba(df[confounders])[:, 1]
# Check overlap (Common Support)
sns.histplot(data=df, x='propensity_score', hue='is_premium', element='step')
```

```python
from sklearn.neighbors import NearestNeighbors

# Separate groups
treatment = df[df['is_premium'] == 1]
control = df[df['is_premium'] == 0]
# Find neighbors for treatment group in control group
nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(control[['propensity_score']])
distances, indices = nn.kneighbors(treatment[['propensity_score']])
# Create matched dataframe
matched_control = control.iloc[indices.flatten()]
# Compare outcomes
ate = treatment['spend'].mean() - matched_control['spend'].mean()
print(f"Average Treatment Effect (ATE): ${ate:.2f}")abs(mean_diff) / pooled_std < 0.1X_trainX_testPipelinescale_pos_weightclass_weight='balanced'requirements.txtenvironment.ymlrandom_state=42# Bootstrap confidence interval for difference in means
```python
# Bootstrap confidence interval for difference in means
rng = np.random.default_rng(42)
bootstrap_diffs = [rng.choice(treatment['spend'].to_numpy(), len(treatment)).mean()
                   - rng.choice(control['spend'].to_numpy(), len(control)).mean()
                   for _ in range(10_000)]
diff = treatment['spend'].mean() - control['spend'].mean()
ci = np.percentile(bootstrap_diffs, [2.5, 97.5])
print(f"Observed diff: {diff:.2f}, 95% CI: [{ci[0]:.2f}, {ci[1]:.2f}]")
```

Forecast model comparison:

| Model | MAPE | 90% CI Width |
|---|---|---|
| ARIMA | 12.3% | ±15% |
| Prophet | 9.8% | ±12% |
| XGBoost | 7.2% | ±9% |
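For reference, a minimal sketch of how the MAPE figures above could be computed; `y_true` and `y_pred` are hypothetical arrays of hold-out actuals and model forecasts, not defined elsewhere on this page:

```python
import numpy as np

def mape(y_true, y_pred):
    """Mean Absolute Percentage Error, in percent."""
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# e.g. mape(actuals, arima_forecast) for each model in the table above
```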