Loading...
Loading...
万行以上 Excel 数据集的高性能分析引擎。提供 openpyxl read_only 流式读取(iter_rows 支持 10 万行以上)、Parquet 转换加速、内存优化、分块处理和大文件写入模式。**遇到以下任一情况就主动使用本 skill**:①数据行数 ≥ 10k(由 sn-da-excel-workflow 的行数评估步骤触发);②用户出现触发词:大文件 / 大数据量 / 性能优化 / 内存不足 / OOM / 百万行 / 十万行 / 流式读取 / Parquet / 分块处理 / large file / big data / streaming read / chunked processing;③直接使用 pd.read_excel() 导致超时或内存溢出;④用户明确要求对大规模数据集进行高性能处理。仅不用于:小于 10k 行的常规 Excel 分析(使用 sn-da-excel-workflow 即可)。
npx skill4agent add opensensenova/sensenova-skills sn-da-large-file-analysisWhen total rows >= 10,000, you MUST use the methods in this skill.
| Data Scale | Read Strategy | Reason |
|---|---|---|
| < 10k rows | | No memory pressure |
| 10k–100k rows | | Avoid repeated slow reads |
| 100k–1M rows | openpyxl | |
| > 1M rows | Streaming read + multi-sheet split (Excel max 1,048,576 rows per sheet) | Must chunk |
pd.read_excel()fc-listfind ... fontspip installdf.iterrows()itertuples()df.apply(lambda...)import pandas as pd
import numpy as np
import os
import gc
pd.options.mode.copy_on_write = True
# CJK font setup (fixed paths — do NOT search for fonts)
# ⚠️ Copy this block as-is. Do NOT use fc-list, find, subprocess, or glob to locate fonts.
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
_FONT_PATHS = [
'/mnt/afs_agents/SimHei.ttf',
'/mnt/afs_agents/mnt/data/SimHei.ttf',
os.path.expanduser('~/.fonts/SimHei.ttf'),
'/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc',
'/usr/share/fonts/SimHei.ttf',
]
for _p in _FONT_PATHS:
if os.path.exists(_p):
fm.fontManager.addfont(_p)
matplotlib.rcParams['font.family'] = fm.FontProperties(fname=_p).get_name()
break
matplotlib.rcParams['axes.unicode_minus'] = Falseimport openpyxl
def inspect_excel(file_path):
"""Stream-inspect Excel structure. Returns {sheet_name: {rows, columns}}."""
wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
info = {}
for name in wb.sheetnames:
ws = wb[name]
row_count = 0
header = None
for i, row in enumerate(ws.iter_rows(values_only=True)):
if i == 0:
header = [str(c) if c is not None else f"Col_{j}" for j, c in enumerate(row)]
else:
row_count += 1
info[name] = {"rows": row_count, "columns": header}
wb.close()
return info
# Usage
file_info = inspect_excel(file_path)
for sheet, meta in file_info.items():
print(f"Sheet '{sheet}': {meta['rows']} rows, {len(meta['columns'])} cols")
print(f" Columns: {meta['columns'][:10]}...")
total_rows = sum(m['rows'] for m in file_info.values())
print(f"Total rows: {total_rows}")pd.read_excel()import openpyxl
import pyarrow as pa
import pyarrow.parquet as pq
def stream_excel_to_parquet(excel_path, parquet_path, sheet_name=None, chunk_size=50000):
"""Stream Excel rows to Parquet with constant memory usage.
All columns are cast to string to avoid cross-chunk schema mismatches
(Excel mixed-type columns may be all-None in some chunks, causing PyArrow
to infer null type instead of string). Convert numeric columns after loading
Parquet with pd.to_numeric() as needed.
"""
wb = openpyxl.load_workbook(excel_path, read_only=True, data_only=True)
ws = wb[sheet_name] if sheet_name else wb.active
header = None
writer = None
chunk_rows = []
total_written = 0
def _flush(rows):
nonlocal writer
table = pa.table({
col: pa.array(
[str(r[idx]) if r[idx] is not None else None for r in rows],
type=pa.string(),
)
for idx, col in enumerate(header)
})
if writer is None:
writer = pq.ParquetWriter(parquet_path, table.schema)
writer.write_table(table)
for i, row in enumerate(ws.iter_rows(values_only=True)):
if i == 0:
header = [str(c) if c is not None else f"Col_{j}" for j, c in enumerate(row)]
continue
chunk_rows.append(list(row))
if len(chunk_rows) >= chunk_size:
_flush(chunk_rows)
total_written += len(chunk_rows)
print(f" Written {total_written:,} rows...")
chunk_rows = []
gc.collect()
if chunk_rows:
_flush(chunk_rows)
total_written += len(chunk_rows)
if writer:
writer.close()
wb.close()
print(f"Done: {total_written:,} rows -> {parquet_path}")
return total_writtenpd.read_excel()def convert_excel_to_parquet(excel_path, parquet_path, sheet_name=0):
"""Medium file: pd.read_excel -> Parquet cache."""
if os.path.exists(parquet_path):
print(f"Cache exists: {parquet_path}")
return
df = pd.read_excel(excel_path, sheet_name=sheet_name)
df.columns = df.columns.astype(str)
df.to_parquet(parquet_path, engine='pyarrow', compression='snappy')
row_count = len(df)
del df
gc.collect()
print(f"Converted {row_count:,} rows -> {parquet_path}")def optimize_dtypes(df):
"""Auto-downcast numeric types + convert low-cardinality strings to Category.
Typically saves 50-80% memory."""
start_mb = df.memory_usage(deep=True).sum() / 1024**2
for col in df.select_dtypes(include=['int64', 'int32']).columns:
c_min, c_max = df[col].min(), df[col].max()
if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
for col in df.select_dtypes(include=['float64']).columns:
df[col] = df[col].astype(np.float32)
for col in df.select_dtypes(include=['object', 'string']).columns:
if df[col].nunique() / max(len(df), 1) < 0.5:
df[col] = df[col].astype('category')
end_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory: {start_mb:.1f} MB -> {end_mb:.1f} MB (saved {(1 - end_mb/start_mb)*100:.0f}%)")
return dfdef write_large_excel(df, output_path, sheet_name="Sheet1"):
"""Auto-select write strategy based on data size."""
total_cells = len(df) * len(df.columns)
if len(df) > 1_000_000:
csv_path = output_path.rsplit('.', 1)[0] + '.csv'
df.to_csv(csv_path, index=False)
print(f"Over 1M rows — exported as CSV: {csv_path}")
return csv_path
if total_cells > 50_000:
from openpyxl import Workbook
from openpyxl.cell import WriteOnlyCell
wb = Workbook(write_only=True)
ws = wb.create_sheet(title=sheet_name)
ws.append(list(df.columns))
for idx, row in enumerate(df.itertuples(index=False)):
ws.append([None if pd.isna(v) else v for v in row])
if (idx + 1) % 100_000 == 0:
print(f" Written {idx + 1:,} rows...")
wb.save(output_path)
wb.close()
print(f"write_only mode: {len(df):,} rows -> {output_path}")
else:
df.to_excel(output_path, index=False, sheet_name=sheet_name)
print(f"Standard write: {len(df):,} rows -> {output_path}")
return output_pathimport pandas as pd
import os, gc
excel_path = "sales_100k.xlsx"
parquet_path = "sales_100k.parquet"
# === Step 1: Inspect structure ===
file_info = inspect_excel(excel_path)
total_rows = sum(m['rows'] for m in file_info.values())
print(f"Total rows: {total_rows}")
# === Step 2: Choose read strategy by row count ===
if total_rows >= 100_000:
stream_excel_to_parquet(excel_path, parquet_path)
else:
convert_excel_to_parquet(excel_path, parquet_path)
# === Step 3: Load Parquet + optimize memory ===
df = pd.read_parquet(parquet_path)
df = optimize_dtypes(df)
print(f"Shape: {df.shape}")
print(df.head(3))
# === Step 4: Analysis ===
region_sales = df.groupby('Region')['Sales'].sum().sort_values(ascending=False)
print(region_sales)
# === Step 5: Visualization ===
fig, ax = plt.subplots(figsize=(10, 6))
region_sales.plot(kind='bar', ax=ax, color='#4C72B0')
ax.set_title('Sales by Region')
ax.set_ylabel('Sales')
plt.tight_layout()
plt.savefig('region_sales.png', dpi=150, bbox_inches='tight')
plt.show()
# === Step 6: Cleanup ===
del df
gc.collect()import pandas as pd
import os, gc
excel_path = "transactions_1m.xlsx"
parquet_path = "transactions_1m.parquet"
# === Step 1: Stream to Parquet (1M rows — MUST use streaming, never pd.read_excel) ===
stream_excel_to_parquet(excel_path, parquet_path, chunk_size=50000)
# === Step 2: Load only needed columns (saves memory) ===
df = pd.read_parquet(parquet_path, columns=['TransactionID', 'Amount', 'Date', 'Type'])
df = optimize_dtypes(df)
print(f"Shape: {df.shape}, Memory: {df.memory_usage(deep=True).sum()/1024**2:.1f} MB")
# === Step 3: Vectorized filtering (never use apply/iterrows) ===
mask = df['Amount'] > 10000
high_value = df[mask].copy()
print(f"Filtered: {len(high_value):,} / {len(df):,} rows")
# === Step 4: Export ===
output_path = write_large_excel(high_value, 'high_value_transactions.xlsx')
# === Step 5: Cleanup ===
del df, high_value
gc.collect()| Slow (Prohibited) | Fast (Use This) |
|---|---|
| |
| |
| Vectorized boolean indexing |
| |
| |
Estimated MB ≈ rows × cols × 8 / 1024² (numeric columns)
Estimated MB ≈ rows × cols × 50 / 1024² (with text columns)| Rows | 20 cols (numeric) | 20 cols (with text) |
|---|---|---|
| 100k | ~15 MB | ~95 MB |
| 500k | ~76 MB | ~477 MB |
| 1M | ~153 MB | ~953 MB |
pd.read_parquet(columns=[...])read_onlyiter_rowspd.read_excel()del df; gc.collect()openpyxl Workbook(write_only=True)