# Data-quality validation helpers.
# NOTE: this file was recovered from a side-by-side code-translation page;
# the scrape duplicated each definition (original + translation). The
# de-duplicated English versions are kept below.
import great_expectations as gx
from datetime import datetime

# Shared Great Expectations data context for the validation workflow.
# (The scraped source repeated the import and this call; one copy suffices.)
context = gx.get_context()
def validate_data_quality(df):
    """Run basic quality checks on *df* and return a list of issue strings.

    Checks performed:
      * null values (reported per column, only columns that have any),
      * fully duplicated rows,
      * freshness: the newest ``created_at`` value must be at most one day old.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``created_at`` column of datetimes.

    Returns
    -------
    list[str]
        Human-readable issue descriptions; empty when all checks pass.
    """
    issues = []

    # Nulls per column — include only the offending columns in the message.
    null_counts = df.isnull().sum()
    if null_counts.any():
        issues.append(f"Null values found: {null_counts[null_counts > 0]}")

    # Count of rows that are exact duplicates of an earlier row.
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        issues.append(f"Found {duplicates} duplicate rows")

    # Freshness check against the newest record.
    # NOTE(review): assumes created_at is tz-naive like datetime.now() — confirm.
    max_date = df['created_at'].max()
    if (datetime.now() - max_date).days > 1:
        issues.append("Data is stale")

    return issues
def calculate_quality_metrics(df):
    """Compute summary data-quality metrics for *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``email`` (string) and ``created_at`` (datetime) columns.

    Returns
    -------
    dict
        ``completeness``: fraction of non-null cells (1.0 means no nulls).
        ``uniqueness``:   fraction of rows that are not duplicates.
        ``validity``:     fraction of rows whose ``email`` contains ``'@'``.
        ``timeliness``:   age in whole days of the newest ``created_at``.

    Raises
    ------
    ZeroDivisionError
        If *df* is empty (all ratios divide by the row/cell count).
    """
    return {
        'completeness': 1 - (df.isnull().sum().sum() / df.size),
        'uniqueness': df.drop_duplicates().shape[0] / df.shape[0],
        'validity': (df['email'].str.contains('@').sum() / len(df)),
        # NOTE(review): assumes created_at is tz-naive like datetime.now() — confirm.
        'timeliness': (datetime.now() - df['created_at'].max()).days,
    }