pdf-read
Compare original and translation side by side
🇺🇸
Original
English
🇨🇳
Translation
Chinese
PDF Read Skill
PDF读取技能
Extract text and metadata from PDF files.
从PDF文件中提取文本和元数据。
When to Use
适用场景
✅ USE this skill when:
- User uploads a PDF and asks for summary
- Extract text from a PDF document
- Read PDF metadata (author, title, pages)
- Analyze PDF content
✅ 适用本技能的场景:
- 用户上传PDF并要求生成摘要
- 从PDF文档中提取文本
- 读取PDF元数据(作者、标题、页数)
- 分析PDF内容
When NOT to Use
不适用场景
❌ DON'T use this skill when:
- Creating or generating PDFs → use reporting tools
- Editing existing PDFs → use PDF manipulation tools
- OCR on scanned images → use OCR/tesseract tools
- Password-protected PDFs → ask user to unlock first
❌ 请勿使用本技能的场景:
- 创建或生成PDF → 使用报表工具
- 编辑现有PDF → 使用PDF处理工具
- 对扫描图像进行OCR识别 → 使用OCR/tesseract工具
- 受密码保护的PDF → 先请用户解锁
Installation
安装
bash
cd /job
npm install pdf-parse
bash
cd /job
npm install pdf-parse
Usage
使用方法
javascript
const fs = require('fs');
const pdf = require('pdf-parse');
/**
 * Read a PDF from disk and hand back its extracted text and document details.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {Promise<{text: *, pages: *, info: *, version: *, metadata: *}>}
 *   The parser's `text`, `numpages` (exposed as `pages`), `info`, `version`
 *   and `metadata` fields.
 */
async function readPDF(filePath) {
  const parsed = await pdf(fs.readFileSync(filePath));
  // Rename `numpages` to `pages`; every other field keeps the parser's name.
  const { text, numpages: pages, info, version, metadata } = parsed;
  return { text, pages, info, version, metadata };
}
// Example: read one document, then print its page count and a 500-character preview.
const result = await readPDF('/path/to/document.pdf');
const { pages, text } = result;
console.log(`Pages: ${pages}`);
const preview = text.substring(0, 500);
console.log(`Text preview: ${preview}...`);
javascript
const fs = require('fs');
const pdf = require('pdf-parse');
/**
 * Read a PDF from disk and hand back its extracted text and document details.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {Promise<{text: *, pages: *, info: *, version: *, metadata: *}>}
 *   The parser's `text`, `numpages` (exposed as `pages`), `info`, `version`
 *   and `metadata` fields.
 */
async function readPDF(filePath) {
  const parsed = await pdf(fs.readFileSync(filePath));
  // Rename `numpages` to `pages`; every other field keeps the parser's name.
  const { text, numpages: pages, info, version, metadata } = parsed;
  return { text, pages, info, version, metadata };
}
// Example
const result = await readPDF('/path/to/document.pdf');
console.log(`Pages: ${result.pages}`);
console.log(`Text preview: ${result.text.substring(0, 500)}...`);Extract Text by Page Range
按页码范围提取文本
javascript
const pdf = require('pdf-parse');
const fs = require('fs');
/**
 * Extract the text of an inclusive, 1-based page range from a PDF.
 *
 * Earlier behavior: `startPage` was accepted but never used, so the text of
 * pages 1..endPage was always returned. pdf-parse's documented options
 * (`pagerender`, `max`, `version`) have no "first page" setting, so pages
 * before `startPage` are skipped through a custom `pagerender` callback.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @param {number} [startPage=1] - First page to include; values below 1 or
 *   non-numeric values are treated as 1.
 * @param {number} endPage - Last page to include; forwarded to pdf-parse as `max`.
 * @returns {Promise<string>} Text of the requested pages.
 */
async function readPDFPages(filePath, startPage, endPage) {
  const firstPage = Math.max(1, Math.floor(Number(startPage) || 1));
  const dataBuffer = fs.readFileSync(filePath);

  // Renders one page to text, or to '' when the page precedes the range.
  const renderPage = async (pageData) => {
    // pdf.js page proxies expose a 0-based `pageIndex`.
    if (pageData.pageIndex + 1 < firstPage) {
      return '';
    }
    // `normalizeWhitespace` is a getTextContent option, not a top-level
    // pdf-parse option, so it has to be passed here to take effect.
    const content = await pageData.getTextContent({
      normalizeWhitespace: true,
      disableCombineTextItems: false,
    });
    let lastY;
    let pageText = '';
    for (const item of content.items) {
      // transform[5] is the item's y position; a change in y starts a new line.
      const y = item.transform[5];
      pageText += lastY === undefined || lastY === y ? item.str : `\n${item.str}`;
      lastY = y;
    }
    return pageText;
  };

  const data = await pdf(dataBuffer, {
    max: endPage,
    version: 'v2.0.550',
    pagerender: renderPage,
  });

  if (firstPage === 1) {
    return data.text;
  }
  // NOTE(review): pdf-parse appears to prefix every page, including skipped
  // ones, with "\n\n"; collapse that run so the result starts like a normal
  // single-page prefix. Harmless if the library's separator differs.
  return data.text.replace(/^\n+/, '\n\n');
}
javascript
const pdf = require('pdf-parse');
const fs = require('fs');
async function readPDFPages(filePath, startPage, endPage) {
const dataBuffer = fs.readFileSync(filePath);
const data = await pdf(dataBuffer, {
max: endPage,
version: 'v2.0.550',
normalizeWhitespace: true,
});
return data.text;
}Get Metadata
获取元数据
javascript
// Print each document-information field; `?.` yields undefined when `info` is absent.
const result = await readPDF('/path/to/document.pdf');
const infoFields = [
  ['Author:', 'Author'],
  ['Title:', 'Title'],
  ['Subject:', 'Subject'],
  ['Keywords:', 'Keywords'],
  ['Creator:', 'Creator'],
  ['Producer:', 'Producer'],
  ['Creation Date:', 'CreationDate'],
  ['Mod Date:', 'ModDate'],
];
for (const [label, key] of infoFields) {
  console.log(label, result.info?.[key]);
}
javascript
const result = await readPDF('/path/to/document.pdf');
console.log('Author:', result.info?.Author);
console.log('Title:', result.info?.Title);
console.log('Subject:', result.info?.Subject);
console.log('Keywords:', result.info?.Keywords);
console.log('Creator:', result.info?.Creator);
console.log('Producer:', result.info?.Producer);
console.log('Creation Date:', result.info?.CreationDate);
console.log('Mod Date:', result.info?.ModDate);Search Text in PDF
在PDF中搜索文本
javascript
/**
 * Case-insensitively search a PDF's extracted text, one line at a time.
 *
 * @param {string} filePath - Path of the PDF file to search.
 * @param {string} searchTerm - Text to look for.
 * @returns {Promise<{total_matches: number, matches: Array<{line: number, content: string}>}>}
 *   `total_matches` counts every matching line; `matches` holds at most the
 *   first 20, each with its 1-based line number and trimmed content.
 */
async function searchInPDF(filePath, searchTerm) {
  const MAX_RESULTS = 20;
  // Lowercase the term once instead of once per line.
  const needle = searchTerm.toLowerCase();
  const { text } = await readPDF(filePath);

  // An empty term is a substring of every line; report no matches rather
  // than flagging the whole document.
  if (needle === '') {
    return { total_matches: 0, matches: [] };
  }

  const matches = [];
  text.split('\n').forEach((line, index) => {
    if (line.toLowerCase().includes(needle)) {
      matches.push({ line: index + 1, content: line.trim() });
    }
  });

  return {
    total_matches: matches.length,
    matches: matches.slice(0, MAX_RESULTS), // limit results
  };
}
javascript
async function searchInPDF(filePath, searchTerm) {
const result = await readPDF(filePath);
const text = result.text;
const lines = text.split('\n');
const matches = [];
lines.forEach((line, index) => {
if (line.toLowerCase().includes(searchTerm.toLowerCase())) {
matches.push({
line: index + 1,
content: line.trim()
});
}
});
return {
total_matches: matches.length,
matches: matches.slice(0, 20) // limit results
};
}Extract All Text as Single String
将所有文本提取为单个字符串
javascript
/**
 * Return a PDF's whole text as one line.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {Promise<string>} The text with every whitespace run replaced by a
 *   single space and the ends trimmed.
 */
async function extractFullText(filePath) {
  const { text } = await readPDF(filePath);
  // Collapse runs of whitespace (including newlines) first, then trim the ends.
  const collapsed = text.replace(/\s+/g, ' ');
  return collapsed.trim();
}
javascript
async function extractFullText(filePath) {
const result = await readPDF(filePath);
// Normalize whitespace for cleaner output
return result.text.replace(/\s+/g, ' ').trim();
}Handling Large PDFs
处理大体积PDF
For PDFs with many pages, process in chunks:
javascript
async function readPDFFirstNPages(filePath, maxPages = 10) {
const dataBuffer = fs.readFileSync(filePath);
const data = await pdf(dataBuffer, { max: maxPages });
return {
text: data.text,
total_pages: data.numpages,
pages_read: Math.min(maxPages, data.numpages)
};
}对于页数较多的PDF,分块处理:
javascript
async function readPDFFirstNPages(filePath, maxPages = 10) {
const dataBuffer = fs.readFileSync(filePath);
const data = await pdf(dataBuffer, { max: maxPages });
return {
text: data.text,
total_pages: data.numpages,
pages_read: Math.min(maxPages, data.numpages)
};
}Error Handling
错误处理
javascript
/**
 * Read a PDF without throwing: failures come back as a result object.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {Promise<object>} `{ success: true, ...readPDF fields }` on success,
 *   otherwise `{ success: false, error: string }`.
 */
async function safeReadPDF(filePath) {
  try {
    const result = await readPDF(filePath);
    return { success: true, ...result };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error with a string message;
    // normalise it so the checks below cannot throw from inside the catch.
    const message = String(error?.message ?? error);
    const lowered = message.toLowerCase();

    // Match case-insensitively: pdf.js words one of its password errors as
    // "Incorrect Password", which a lowercase-only check misses.
    if (error?.name === 'PasswordException' || lowered.includes('password')) {
      return { success: false, error: 'PDF is password-protected' };
    }
    if (error?.name === 'InvalidPDFException' || lowered.includes('parse')) {
      return { success: false, error: 'Invalid or corrupted PDF' };
    }
    return { success: false, error: message };
  }
}
javascript
async function safeReadPDF(filePath) {
try {
const result = await readPDF(filePath);
return { success: true, ...result };
} catch (error) {
if (error.message.includes('password')) {
return { success: false, error: 'PDF is password-protected' };
}
if (error.message.includes('parse')) {
return { success: false, error: 'Invalid or corrupted PDF' };
}
return { success: false, error: error.message };
}
}Quick Response Template
快速响应模板
"Read this PDF"
javascript
const result = await readPDF(filePath);
return `📄 **PDF Summary**
**Pages:** ${result.pages}
**Title:** ${result.info?.Title || 'N/A'}
**Author:** ${result.info?.Author || 'N/A'}
**Preview (first 500 chars):**
${result.text.substring(0, 500)}...
`;"读取此PDF"
javascript
const result = await readPDF(filePath);
return `📄 **PDF摘要**
**页数:** ${result.pages}
**标题:** ${result.info?.Title || '无'}
**作者:** ${result.info?.Author || '无'}
**预览(前500字符):**
${result.text.substring(0, 500)}...
`;Notes
注意事项
- pdf-parse works on most standard PDFs
- Does NOT support OCR for scanned documents
- Does NOT handle password-protected PDFs
- For image-heavy PDFs, text extraction may be limited
- Large PDFs (>100 pages) should be read in chunks
- pdf-parse适用于大多数标准PDF
- 不支持对扫描文档进行OCR识别
- 无法处理受密码保护的PDF
- 对于图片较多的PDF,文本提取效果可能受限
- 大体积PDF(>100页)应分块读取