pandas
Compare original and translation side by side
🇺🇸
Original
English🇨🇳
Translation
ChineseData Processing: pandas, openpyxl, lxml
数据处理:pandas、openpyxl、lxml
pandas Essentials
pandas 核心操作
Reading Data
读取数据
python
import pandas as pdpython
import pandas as pdCSV (handle various separators, encodings)
CSV (handle various separators, encodings)
df = pd.read_csv("data.csv", sep=';', encoding='utf-8')
df = pd.read_csv("data.csv", sep=';', encoding='utf-16', dtype_backend='numpy_nullable')
df = pd.read_csv("data.csv", sep=';', encoding='utf-8')
df = pd.read_csv("data.csv", sep=';', encoding='utf-16', dtype_backend='numpy_nullable')
Excel
Excel
df = pd.read_excel("data.xlsx", sheet_name="Motors", header=0)
df = pd.read_excel("data.xlsx", sheet_name=0, usecols="A:F", nrows=500)
df = pd.read_excel("data.xlsx", sheet_name="Motors", header=0)
df = pd.read_excel("data.xlsx", sheet_name=0, usecols="A:F", nrows=500)
Chunked reading for large files
Chunked reading for large files
for chunk in pd.read_csv("large.csv", sep=';', chunksize=10_000):
process(chunk)
undefinedfor chunk in pd.read_csv("large.csv", sep=';', chunksize=10_000):
process(chunk)
undefinedSelection and Filtering
数据选择与过滤
python
undefinedpython
undefinedColumn selection
Column selection
df['tag'] # Series
df[['tag', 'area', 'power_kw']] # DataFrame
df['tag'] # Series
df[['tag', 'area', 'power_kw']] # DataFrame
Row selection
Row selection
df.loc[df['area'] == 11301] # boolean mask
df.loc[df['area'].isin([11301, 11090])] # multiple values
df.loc[(df['area'] == 11301) & (df['active'])] # combined
df.loc[df['area'] == 11301] # boolean mask
df.loc[df['area'].isin([11301, 11090])] # multiple values
df.loc[(df['area'] == 11301) & (df['active'])] # combined
Positional
Positional
df.iloc[0] # first row
df.iloc[0:10, 0:3] # slice rows and columns
undefineddf.iloc[0] # first row
df.iloc[0:10, 0:3] # slice rows and columns
undefinedData Transformation
数据转换
python
undefinedpython
undefinedVectorized operations (fast — avoid .apply() for simple ops)
Vectorized operations (fast — avoid .apply() for simple ops)
df['normalized'] = df['value'] / df['value'].max()
df['tag_upper'] = df['tag'].str.upper().str.strip()
df['area_str'] = df['area'].astype(str)
df['normalized'] = df['value'] / df['value'].max()
df['tag_upper'] = df['tag'].str.upper().str.strip()
df['area_str'] = df['area'].astype(str)
Apply (for complex per-row logic)
Apply (for complex per-row logic)
df['node_name'] = df.apply(
lambda r: f"{r['area']}{r['equip_code']}{r['number']}", axis=1
)
df['node_name'] = df.apply(
lambda r: f"{r['area']}{r['equip_code']}{r['number']}", axis=1
)
Map (for value replacement)
Map (for value replacement)
df['type_label'] = df['type_code'].map({'M': 'Motor', 'V': 'Valve', 'P': 'PID'})
undefineddf['type_label'] = df['type_code'].map({'M': 'Motor', 'V': 'Valve', 'P': 'PID'})
undefinedAggregation
数据聚合
python
undefinedpython
undefinedGroup and aggregate
Group and aggregate
summary = df.groupby('area').agg(
count=('tag', 'count'),
total_power=('power_kw', 'sum'),
max_power=('power_kw', 'max'),
)
summary = df.groupby('area').agg(
count=('tag', 'count'),
total_power=('power_kw', 'sum'),
max_power=('power_kw', 'max'),
)
Pivot table
Pivot table
pivot = df.pivot_table(
values='power_kw', index='area', columns='type', aggfunc='sum', fill_value=0
)
undefinedpivot = df.pivot_table(
values='power_kw', index='area', columns='type', aggfunc='sum', fill_value=0
)
undefinedMerge / Join
数据合并/连接
python
undefinedpython
undefinedMerge on key (like SQL JOIN)
Merge on key (like SQL JOIN)
result = pd.merge(df_tags, df_params, on='tag', how='left')
result = pd.merge(df_tags, df_params, left_on='tag', right_on='TagName', how='inner')
result = pd.merge(df_tags, df_params, on='tag', how='left')
result = pd.merge(df_tags, df_params, left_on='tag', right_on='TagName', how='inner')
Concat (stack DataFrames)
Concat (stack DataFrames)
combined = pd.concat([df1, df2, df3], ignore_index=True)
undefinedcombined = pd.concat([df1, df2, df3], ignore_index=True)
undefinedWriting Data
写入数据
python
undefinedpython
undefinedCSV
CSV
df.to_csv("output.csv", index=False, sep=';', encoding='utf-8')
df.to_csv("output.csv", index=False, sep=';', encoding='utf-8')
Excel
Excel
df.to_excel("output.xlsx", sheet_name="Tags", index=False)
df.to_excel("output.xlsx", sheet_name="Tags", index=False)
Multiple sheets
Multiple sheets
with pd.ExcelWriter("report.xlsx", engine='openpyxl') as writer:
df_motors.to_excel(writer, sheet_name="Motors", index=False)
df_valves.to_excel(writer, sheet_name="Valves", index=False)
undefinedwith pd.ExcelWriter("report.xlsx", engine='openpyxl') as writer:
df_motors.to_excel(writer, sheet_name="Motors", index=False)
df_valves.to_excel(writer, sheet_name="Valves", index=False)
undefinedPerformance Tips
性能优化技巧
python
undefinedpython
undefinedUse categoricals for low-cardinality strings
Use categoricals for low-cardinality strings
df['area'] = df['area'].astype('category')
df['type'] = df['type'].astype('category')
df['area'] = df['area'].astype('category')
df['type'] = df['type'].astype('category')
Downcast numerics
Downcast numerics
df['count'] = pd.to_numeric(df['count'], downcast='integer')
df['value'] = pd.to_numeric(df['value'], downcast='float')
df['count'] = pd.to_numeric(df['count'], downcast='integer')
df['value'] = pd.to_numeric(df['value'], downcast='float')
Avoid object dtype for mixed types — specify explicitly
Avoid object dtype for mixed types — specify explicitly
df = pd.read_csv(path, dtype={'area': int, 'tag': str, 'value': float})
df = pd.read_csv(path, dtype={'area': int, 'tag': str, 'value': float})
Use .loc for assignment (avoid SettingWithCopyWarning)
Use .loc for assignment (avoid SettingWithCopyWarning)
df.loc[mask, 'col'] = value
---df.loc[mask, 'col'] = value
---openpyxl — Excel Read/Write
openpyxl — Excel 读写
Read with openpyxl
使用 openpyxl 读取数据
python
from openpyxl import load_workbookpython
from openpyxl import load_workbookRead-only for large files
Read-only for large files
wb = load_workbook("data.xlsx", read_only=True, data_only=True)
ws = wb["Motors"]
wb = load_workbook("data.xlsx", read_only=True, data_only=True)
ws = wb["Motors"]
Iterate rows (skipping header)
Iterate rows (skipping header)
for row in ws.iter_rows(min_row=2, values_only=True):
tag, desc, area, power = row[0], row[1], row[2], row[3]
for row in ws.iter_rows(min_row=2, values_only=True):
tag, desc, area, power = row[0], row[1], row[2], row[3]
Named ranges
Named ranges
cell_range = wb.defined_names["motor_list"]
for title, coord in cell_range.destinations:
ws = wb[title]
for row in ws[coord]:
print([cell.value for cell in row])
wb.close()
undefinedcell_range = wb.defined_names["motor_list"]
for title, coord in cell_range.destinations:
ws = wb[title]
for row in ws[coord]:
print([cell.value for cell in row])
wb.close()
undefinedWrite with openpyxl
使用 openpyxl 写入数据
python
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
wb = Workbook()
ws = wb.active
ws.title = "Generated Tags"python
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
wb = Workbook()
ws = wb.active
ws.title = "Generated Tags"Header row with style
Header row with style
headers = ["Tag", "Description", "Area", "Power (kW)"]
for col, header in enumerate(headers, 1):
cell = ws.cell(row=1, column=col, value=header)
cell.font = Font(bold=True)
cell.fill = PatternFill(fill_type="solid", fgColor="4472C4")
cell.font = Font(bold=True, color="FFFFFF")
headers = ["Tag", "Description", "Area", "Power (kW)"]
for col, header in enumerate(headers, 1):
cell = ws.cell(row=1, column=col, value=header)
cell.font = Font(bold=True)
cell.fill = PatternFill(fill_type="solid", fgColor="4472C4")
cell.font = Font(bold=True, color="FFFFFF")
Data rows
Data rows
for i, row_data in enumerate(rows, 2):
for col, value in enumerate(row_data, 1):
ws.cell(row=i, column=col, value=value)
for i, row_data in enumerate(rows, 2):
for col, value in enumerate(row_data, 1):
ws.cell(row=i, column=col, value=value)
Column widths
Column widths
ws.column_dimensions['A'].width = 25
ws.column_dimensions['B'].width = 40
wb.save("output.xlsx")
---ws.column_dimensions['A'].width = 25
ws.column_dimensions['B'].width = 40
wb.save("output.xlsx")
---lxml — XML/HTML Parsing
lxml — XML/HTML 解析
Parse XML
解析 XML
python
from lxml import etreepython
from lxml import etreeParse from file
Parse from file
tree = etree.parse("config.xml")
root = tree.getroot()
tree = etree.parse("config.xml")
root = tree.getroot()
Parse from string
Parse from string
root = etree.fromstring(b"<root><item id='1'>text</item></root>")
root = etree.fromstring(b"<root><item id='1'>text</item></root>")
XPath without namespaces
XPath without namespaces
items = root.xpath("//item[@id]")
texts = root.xpath("//item/text()")
items = root.xpath("//item[@id]")
texts = root.xpath("//item/text()")
XPath with namespaces
XPath with namespaces
ns = {'plc': 'http://www.plcopen.org/xml/tc6_0201'}
pous = root.xpath("//plc:pou", namespaces=ns)
undefinedns = {'plc': 'http://www.plcopen.org/xml/tc6_0201'}
pous = root.xpath("//plc:pou", namespaces=ns)
undefinedBuild XML
构建 XML
python
from lxml import etree
def build_tag_xml(tags: list[dict]) -> bytes:
root = etree.Element("TagList")
for tag_data in tags:
tag_el = etree.SubElement(root, "Tag")
tag_el.set("name", tag_data["name"])
tag_el.set("area", str(tag_data["area"]))
desc_el = etree.SubElement(tag_el, "Description")
desc_el.text = tag_data.get("description", "")
return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")python
from lxml import etree
def build_tag_xml(tags: list[dict]) -> bytes:
root = etree.Element("TagList")
for tag_data in tags:
tag_el = etree.SubElement(root, "Tag")
tag_el.set("name", tag_data["name"])
tag_el.set("area", str(tag_data["area"]))
desc_el = etree.SubElement(tag_el, "Description")
desc_el.text = tag_data.get("description", "")
return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")Namespace Handling
命名空间处理
python
undefinedpython
undefinedStrip namespace prefixes for simpler XPath
Strip namespace prefixes for simpler XPath
def strip_ns(tree: etree._Element) -> etree._Element:
for el in tree.iter():
if el.tag.startswith('{'):
el.tag = el.tag.split('}', 1)[1]
return tree
def strip_ns(tree: etree._Element) -> etree._Element:
for el in tree.iter():
if el.tag.startswith('{'):
el.tag = el.tag.split('}', 1)[1]
return tree
Or use Clark notation in XPath
Or use Clark notation in XPath
ns = {'ns': 'http://example.com/schema'}
elements = root.xpath('/ns:root/ns:items/ns:item', namespaces=ns)
---ns = {'ns': 'http://example.com/schema'}
elements = root.xpath('/ns:root/ns:items/ns:item', namespaces=ns)
---File Encoding Handling
文件编码处理
UTF-16LE (industrial formats: ABB Freelance, etc.)
UTF-16LE(工业格式:ABB Freelance 等)
python
import codecs
def read_utf16(path: str) -> str:
"""Auto-detect UTF-16 LE/BE via BOM."""
with codecs.open(path, 'r', 'utf-16') as f:
return f.read()
def write_utf16le(path: str, content: str) -> None:
"""Write UTF-16LE with BOM (required by ABB Freelance)."""
with codecs.open(path, 'w', 'utf-16-le') as f:
f.write('\ufeff' + content) # \ufeff = UTF-16LE BOM
def detect_encoding(path: str) -> str:
"""Detect encoding via BOM bytes."""
with open(path, 'rb') as f:
bom = f.read(4)
if bom.startswith(b'\xff\xfe'):
return 'utf-16-le'
elif bom.startswith(b'\xfe\xff'):
return 'utf-16-be'
elif bom.startswith(b'\xef\xbb\xbf'):
return 'utf-8-sig'
return 'utf-8'python
import codecs
def read_utf16(path: str) -> str:
"""Auto-detect UTF-16 LE/BE via BOM."""
with codecs.open(path, 'r', 'utf-16') as f:
return f.read()
def write_utf16le(path: str, content: str) -> None:
"""Write UTF-16LE with BOM (required by ABB Freelance)."""
with codecs.open(path, 'w', 'utf-16-le') as f:
f.write('\ufeff' + content) # \ufeff = UTF-16LE BOM
def detect_encoding(path: str) -> str:
"""Detect encoding via BOM bytes."""
with open(path, 'rb') as f:
bom = f.read(4)
if bom.startswith(b'\xff\xfe'):
return 'utf-16-le'
elif bom.startswith(b'\xfe\xff'):
return 'utf-16-be'
elif bom.startswith(b'\xef\xbb\xbf'):
return 'utf-8-sig'
return 'utf-8'Common Patterns
常见模式
Template-Based Bulk File Generation
基于模板的批量文件生成
python
import codecs
import re
def generate_instances(template_path: str, rows: list[dict], output_dir: str) -> list[str]:
template = read_utf16(template_path)
generated = []
for row in rows:
content = template
for placeholder, value in row.items():
content = content.replace(f"{{{placeholder}}}", str(value))
# Reset checksum
content = re.sub(r'\[CHECKSUM\];.*', '[CHECKSUM];0000000000', content)
out_path = f"{output_dir}/{row['node_name']}.prt"
write_utf16le(out_path, content)
generated.append(out_path)
return generatedpython
import codecs
import re
def generate_instances(template_path: str, rows: list[dict], output_dir: str) -> list[str]:
template = read_utf16(template_path)
generated = []
for row in rows:
content = template
for placeholder, value in row.items():
content = content.replace(f"{{{placeholder}}}", str(value))
# Reset checksum
content = re.sub(r'\[CHECKSUM\];.*', '[CHECKSUM];0000000000', content)
out_path = f"{output_dir}/{row['node_name']}.prt"
write_utf16le(out_path, content)
generated.append(out_path)
return generatedValidate Excel Input Before Processing
处理前验证 Excel 输入
python
REQUIRED_COLUMNS = {'tag', 'description', 'area', 'power_kw'}
def validate_input_excel(path: str, sheet: str) -> pd.DataFrame:
df = pd.read_excel(path, sheet_name=sheet)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
missing = REQUIRED_COLUMNS - set(df.columns)
if missing:
raise ValueError(f"Missing required columns: {missing}")
df = df.dropna(subset=['tag']) # drop rows without tag
df['area'] = pd.to_numeric(df['area'], errors='coerce').astype('Int64')
invalid = df[df['area'].isna()]
if not invalid.empty:
raise ValueError(f"{len(invalid)} rows with invalid area code")
return dfpython
REQUIRED_COLUMNS = {'tag', 'description', 'area', 'power_kw'}
def validate_input_excel(path: str, sheet: str) -> pd.DataFrame:
df = pd.read_excel(path, sheet_name=sheet)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
missing = REQUIRED_COLUMNS - set(df.columns)
if missing:
raise ValueError(f"Missing required columns: {missing}")
df = df.dropna(subset=['tag']) # drop rows without tag
df['area'] = pd.to_numeric(df['area'], errors='coerce').astype('Int64')
invalid = df[df['area'].isna()]
if not invalid.empty:
raise ValueError(f"{len(invalid)} rows with invalid area code")
return df