Loading...
Loading...
Exploratory Data Analysis skill for CSV and parquet datasets with deterministic profiling, drift/anomaly scans, contract generation and validation, and optional memory writeback into skill-system-memory. The implementation is Polars-first (lazy scans for large files, with `--sample` applied as an early head), includes high-cardinality guards for the profile/importance/contract flows, and supports categorical correlation via Cramér's V. Use it when building or reviewing tabular fraud/risk/data-quality workflows, profiling new datasets, checking for leakage or drift, or saving/validating data contracts.
npx skill4agent add arthur0824hao/skills skill-system-eda
scripts/eda.py
python3 scripts/eda.py profile-dataset --input data.csv --target Class --output /tmp/eda
python3 scripts/eda.py distribution-report --input data.csv --target Class --profile /tmp/eda/profile.yaml
python3 scripts/eda.py correlation-matrix --input data.csv --target Class --profile /tmp/eda/profile.yaml
python3 scripts/eda.py anomaly-profiling --input data.csv --target Class --profile /tmp/eda/profile.yaml
python3 scripts/eda.py feature-importance-scan --input data.csv --target Class --profile /tmp/eda/profile.yaml
python3 scripts/eda.py leakage-detector --input data.csv --target Class --profile /tmp/eda/profile.yaml
python3 scripts/eda.py save-contract --profile /tmp/eda/profile.yaml --output /tmp/eda/contract.yaml
python3 scripts/eda.py validate-contract --input new_data.csv --contract /tmp/eda/contract.yaml
`profile-dataset` `profile.yaml` `report.md` `profile.yaml` `report.md` `save-contract` `contract.yaml` `validate-contract` `PASS` `FAIL` `.head(N)` `--sample` `profile.yaml` `report.md` `scan_csv` `scan_parquet` `>50` `>100` `>50%` `skill-system-memory/scripts/mem.py store` `EDA_DISABLE_MEM_PY=1` `.memory/pending/` `--no-memory` `save-contract` `profile.yaml` `cardinality_range` `allowed_values` `validate-contract`
{
"schema_version": "2.0",
"id": "skill-system-eda",
"version": "1.0.0",
"capabilities": [
"eda-profile",
"eda-distribution",
"eda-correlation",
"eda-anomaly",
"eda-feature-importance",
"eda-leakage",
"eda-contract-save",
"eda-contract-validate"
],
"effects": ["fs.read", "fs.write", "proc.exec"],
"operations": {
"profile-dataset": {
"description": "Profile a CSV/parquet dataset and generate profile.yaml plus report.md.",
"input": {
"input": { "type": "string", "required": true },
"target": { "type": "string", "required": false },
"output": { "type": "string", "required": true },
"sample": { "type": "integer", "required": false },
"no_memory": { "type": "boolean", "required": false }
},
"output": {
"description": "Artifact paths for the generated EDA profile",
"fields": { "profile": "string", "report": "string" }
},
"entrypoints": {
"unix": ["python3", "scripts/eda.py", "profile-dataset", "--input", "{input}", "--output", "{output}"]
}
},
"distribution-report": {
"description": "Append distribution and class-conditional analysis to an existing profile/report.",
"input": {
"input": { "type": "string", "required": true },
"target": { "type": "string", "required": true },
"profile": { "type": "string", "required": true }
},
"output": { "description": "Updated profile/report paths", "fields": { "profile": "string", "report": "string" } },
"entrypoints": {
"unix": ["python3", "scripts/eda.py", "distribution-report", "--input", "{input}", "--target", "{target}", "--profile", "{profile}"]
}
},
"correlation-matrix": {
"description": "Compute feature and target correlations and append them to profile/report.",
"input": {
"input": { "type": "string", "required": true },
"target": { "type": "string", "required": false },
"profile": { "type": "string", "required": true }
},
"output": { "description": "Updated profile/report paths", "fields": { "profile": "string", "report": "string" } },
"entrypoints": {
"unix": ["python3", "scripts/eda.py", "correlation-matrix", "--input", "{input}", "--profile", "{profile}"]
}
},
"anomaly-profiling": {
"description": "Compare class-conditional distributions and effect sizes.",
"input": {
"input": { "type": "string", "required": true },
"target": { "type": "string", "required": true },
"profile": { "type": "string", "required": true }
},
"output": { "description": "Updated profile/report paths", "fields": { "profile": "string", "report": "string" } },
"entrypoints": {
"unix": ["python3", "scripts/eda.py", "anomaly-profiling", "--input", "{input}", "--target", "{target}", "--profile", "{profile}"]
}
},
"feature-importance-scan": {
"description": "Rank features with mutual information and optional tree importances.",
"input": {
"input": { "type": "string", "required": true },
"target": { "type": "string", "required": true },
"profile": { "type": "string", "required": true }
},
"output": { "description": "Updated profile/report paths", "fields": { "profile": "string", "report": "string" } },
"entrypoints": {
"unix": ["python3", "scripts/eda.py", "feature-importance-scan", "--input", "{input}", "--target", "{target}", "--profile", "{profile}"]
}
},
"leakage-detector": {
"description": "Detect high-correlation, target-encoding, and temporal leakage indicators.",
"input": {
"input": { "type": "string", "required": true },
"target": { "type": "string", "required": true },
"profile": { "type": "string", "required": true }
},
"output": { "description": "Updated profile/report paths", "fields": { "profile": "string", "report": "string" } },
"entrypoints": {
"unix": ["python3", "scripts/eda.py", "leakage-detector", "--input", "{input}", "--target", "{target}", "--profile", "{profile}"]
}
},
"save-contract": {
"description": "Generate a data contract from a saved EDA profile.",
"input": {
"profile": { "type": "string", "required": true },
"output": { "type": "string", "required": true }
},
"output": { "description": "Contract path", "fields": { "contract": "string" } },
"entrypoints": {
"unix": ["python3", "scripts/eda.py", "save-contract", "--profile", "{profile}", "--output", "{output}"]
}
},
"validate-contract": {
"description": "Validate a new dataset against a saved contract and emit PASS/FAIL JSON.",
"input": {
"input": { "type": "string", "required": true },
"contract": { "type": "string", "required": true }
},
"output": { "description": "Validation status and violations", "fields": { "status": "string", "violations": "array" } },
"entrypoints": {
"unix": ["python3", "scripts/eda.py", "validate-contract", "--input", "{input}", "--contract", "{contract}"]
}
}
},
"stdout_contract": {
"last_line_json": true
}
}