Loading...
Loading...
Parse HWP, HWPX, and PDF Korean documents to Markdown using kordoc — supports CLI, programmatic API, and MCP server integration.
npx skill4agent add aradotso/trending-skills kordoc-korean-document-parser

Skill by ara.so — Daily 2026 Skills collection.
IRBlock[]

# Core library
npm install kordoc
# PDF support (optional peer dependency)
npm install pdfjs-dist
# CLI (no install needed)
npx kordoc document.hwpx

import { parse } from "kordoc"
import { readFileSync } from "fs"
const buffer = readFileSync("document.hwpx")
const result = await parse(buffer.buffer) // ArrayBuffer required
if (result.success) {
console.log(result.markdown) // string: full Markdown
console.log(result.blocks) // IRBlock[]: structured data
console.log(result.metadata) // { title, author, createdAt, pageCount, ... }
console.log(result.outline) // OutlineItem[]: document structure
console.log(result.warnings) // ParseWarning[]: skipped elements
} else {
console.error(result.error) // string message
console.error(result.code) // ErrorCode: "ENCRYPTED" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | ...
}

import { parseHwpx, parseHwp, parsePdf, detectFormat } from "kordoc"
// Detect format first
const fmt = detectFormat(buffer.buffer) // "hwpx" | "hwp" | "pdf" | "unknown"
// Parse by format
const hwpxResult = await parseHwpx(buffer.buffer)
const hwpResult = await parseHwp(buffer.buffer)
const pdfResult = await parsePdf(buffer.buffer)

import { parse, ParseOptions } from "kordoc"
const result = await parse(buffer.buffer, {
pages: "1-3", // page range string
// pages: [1, 5, 10], // or specific page numbers
ocr: async (pageImage, pageNumber, mimeType) => {
// Pluggable OCR for image-based PDFs
// pageImage: ArrayBuffer of the page image
return await myOcrService.recognize(pageImage)
}
})

import type { IRBlock, IRBlockType, IRTable, IRCell } from "kordoc"
// IRBlock types: "heading" | "paragraph" | "table" | "list" | "image" | "separator"
for (const block of result.blocks) {
if (block.type === "heading") {
console.log(`H${block.level}: ${block.text}`)
console.log(block.bbox) // { x, y, width, height, page }
}
if (block.type === "table") {
const table = block as IRTable
for (const row of table.rows) {
for (const cell of row) {
console.log(cell.text, cell.colspan, cell.rowspan)
}
}
}
if (block.type === "paragraph") {
console.log(block.text)
console.log(block.style) // InlineStyle: { bold, italic, fontSize, ... }
console.log(block.pageNumber)
}
}

import { blocksToMarkdown } from "kordoc"
const markdown = blocksToMarkdown(result.blocks)

import { compare } from "kordoc"
const bufA = readFileSync("v1.hwp").buffer
const bufB = readFileSync("v2.hwpx").buffer // cross-format supported
const diff = await compare(bufA, bufB)
console.log(diff.stats)
// { added: 3, removed: 1, modified: 5, unchanged: 42 }
for (const d of diff.diffs) {
// d.type: "added" | "removed" | "modified" | "unchanged"
// d.blockA, d.blockB: IRBlock
// d.cellDiffs: CellDiff[] for table blocks
console.log(d.type, d.blockA?.text ?? d.blockB?.text)
}

import { parse, extractFormFields } from "kordoc"
const result = await parse(buffer.buffer)
if (result.success) {
const form = extractFormFields(result.blocks)
console.log(form.confidence) // 0.0–1.0
for (const field of form.fields) {
// { label: "성명", value: "홍길동", row: 0, col: 0 }
console.log(`${field.label}: ${field.value}`)
}
}

import { markdownToHwpx } from "kordoc"
import { writeFileSync } from "fs"
const markdown = `
# 제목
본문 내용입니다.
| 구분 | 내용 |
| --- | --- |
| 항목1 | 값1 |
| 항목2 | 값2 |
`
const hwpxBuffer = await markdownToHwpx(markdown)
writeFileSync("output.hwpx", Buffer.from(hwpxBuffer))

# Basic conversion — output to stdout
npx kordoc document.hwpx
# Save to file
npx kordoc document.hwp -o output.md
# Batch convert all PDFs to a directory
npx kordoc *.pdf -d ./converted/
# JSON output with blocks + metadata
npx kordoc report.hwpx --format json
# Parse specific pages only
npx kordoc report.hwpx --pages 1-3
# Watch mode — auto-convert new files
npx kordoc watch ./incoming -d ./output
# Watch with webhook notification on conversion
npx kordoc watch ./docs --webhook https://api.example.com/hook

{
"mcpServers": {
"kordoc": {
"command": "npx",
"args": ["-y", "kordoc-mcp"]
}
}
}

| Tool | Description |
|---|---|
| | Parse HWP/HWPX/PDF → Markdown + metadata + outline + warnings |
| | Detect file format via magic bytes |
| `parse_metadata` | Extract only metadata (fast, no full parse) |
| | Parse a specific page range |
| | Extract the Nth table from a document |
| | Diff two documents (cross-format supported) |
| | Extract form fields as structured JSON |

import type {
// Results
ParseResult, ParseSuccess, ParseFailure,
ErrorCode, // "ENCRYPTED" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | ...
// Blocks
IRBlock, IRBlockType, IRTable, IRCell, CellContext,
// Metadata & structure
DocumentMetadata, OutlineItem,
ParseWarning, WarningCode,
BoundingBox, // { x, y, width, height, page }
InlineStyle, // { bold, italic, fontSize, color, ... }
// Options
ParseOptions, FileType,
OcrProvider, // async (image, pageNum, mime) => string
WatchOptions,
// Diff
DiffResult, BlockDiff, CellDiff, DiffChangeType,
// Forms
FormField, FormResult,
} from "kordoc"

import { parse, detectFormat } from "kordoc"
import { readFileSync } from "fs"
import { glob } from "glob"
const files = await glob("./docs/**/*.{hwp,hwpx,pdf}")
for (const file of files) {
const buffer = readFileSync(file)
const fmt = detectFormat(buffer.buffer)
if (fmt === "unknown") {
console.warn(`Skipping unknown format: ${file}`)
continue
}
const result = await parse(buffer.buffer)
if (!result.success) {
if (result.code === "ENCRYPTED") {
console.warn(`Encrypted, skipping: ${file}`)
} else if (result.code === "IMAGE_BASED_PDF") {
console.warn(`Image-based PDF needs OCR: ${file}`)
} else {
console.error(`Failed: ${file} — ${result.error}`)
}
continue
}
console.log(`Parsed ${file}: ${result.blocks.length} blocks`)
}

import { parse } from "kordoc"
import type { IRTable } from "kordoc"
const result = await parse(buffer.buffer)
if (result.success) {
const tables = result.blocks.filter(b => b.type === "table") as IRTable[]
tables.forEach((table, i) => {
console.log(`\n--- Table ${i + 1} ---`)
for (const row of table.rows) {
const cells = row.map(cell => cell.text.trim()).join(" | ")
console.log(`| ${cells} |`)
}
})
}

import { parse } from "kordoc"
import Tesseract from "tesseract.js"
const result = await parse(buffer.buffer, {
ocr: async (pageImage, pageNumber, mimeType) => {
const blob = new Blob([pageImage], { type: mimeType })
const url = URL.createObjectURL(blob)
const { data } = await Tesseract.recognize(url, "kor+eng")
URL.revokeObjectURL(url)
return data.text
}
})

import { watch } from "kordoc"
const watcher = watch("./incoming", {
output: "./converted",
webhook: process.env.WEBHOOK_URL,
onFile: async (file, result) => {
if (result.success) {
console.log(`Converted: ${file}`)
}
}
})
// Stop watching
watcher.stop()

- `buffer.buffer`
- `Buffer`
- `ArrayBuffer`
- `Buffer`
- `readFileSync("file").buffer`
- `.buffer`
- `Uint8Array`
- `npm install pdfjs-dist`
- `"IMAGE_BASED_PDF"`
- `ocr`
- `"ENCRYPTED"`
- `pages`
- `parse(buf, { pages: "1-5" })`
- `parse_metadata`
- `result.metadata`