graphrag-patterns
Compare original and translation side by side
🇺🇸
Original
English🇨🇳
Translation
Chinese

GraphRAG Patterns
GraphRAG 模式
Combine knowledge graphs with RAG for relationship-aware retrieval and reasoning.
结合知识图谱与RAG,实现具备关系感知能力的检索与推理。
When to Use
适用场景
- Data has rich entity relationships
- Questions involve connections ("How is X related to Y?")
- Need multi-hop reasoning across documents
- Building over structured + unstructured data
- Want explainable retrieval paths
- 数据包含丰富的实体关系
- 问题涉及实体间的关联(例如:“X与Y有何关联?”)
- 需要跨文档的多跳推理
- 基于结构化+非结构化数据构建系统
- 需要可解释的检索路径
GraphRAG Architecture
GraphRAG 架构
┌──────────────────────────────────────────────────────────┐
│ Documents │
└─────────────────────────┬────────────────────────────────┘
│
┌───────────────┼───────────────┐
│ │ │
▼ ▼ ▼
┌────────────┐ ┌────────────┐ ┌────────────┐
│ Entity │ │ Vector │ │ Text │
│ Extraction │ │ Embeddings │ │ Chunks │
└─────┬──────┘ └─────┬──────┘ └─────┬──────┘
│ │ │
▼ │ │
┌────────────┐ │ │
│ Knowledge │ │ │
│ Graph │ │ │
└─────┬──────┘ │ │
│ │ │
└───────────────┼───────────────┘
│
▼
┌─────────────────────┐
│ Hybrid Index │
│ (Graph + Vectors) │
└──────────┬──────────┘
│
▼
┌─────────────────────┐
│ Graph-Aware RAG │
└─────────────────────┘┌──────────────────────────────────────────────────────────┐
│ Documents │
└─────────────────────────┬────────────────────────────────┘
│
┌───────────────┼───────────────┐
│ │ │
▼ ▼ ▼
┌────────────┐ ┌────────────┐ ┌────────────┐
│ Entity │ │ Vector │ │ Text │
│ Extraction │ │ Embeddings │ │ Chunks │
└─────┬──────┘ └─────┬──────┘ └─────┬──────┘
│ │ │
▼ │ │
┌────────────┐ │ │
│ Knowledge │ │ │
│ Graph │ │ │
└─────┬──────┘ │ │
│ │ │
└───────────────┼───────────────┘
│
▼
┌─────────────────────┐
│ Hybrid Index │
│ (Graph + Vectors) │
└──────────┬──────────┘
│
▼
┌─────────────────────┐
│ Graph-Aware RAG │
└─────────────────────┘

Building the Knowledge Graph
构建知识图谱
Entity & Relationship Extraction
实体与关系提取
python
import json

from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt asking the LLM to emit entities and relationships as strict JSON.
# Doubled braces ({{ }}) are literal braces for ChatPromptTemplate.
EXTRACTION_PROMPT = """Extract entities and relationships from the text.
Text: {text}
Return JSON:
{{
"entities": [
{{"name": "...", "type": "PERSON|ORG|PRODUCT|CONCEPT|...", "description": "..."}}
],
"relationships": [
{{"source": "...", "target": "...", "type": "WORKS_FOR|USES|RELATED_TO|...", "description": "..."}}
]
}}
"""


def extract_graph_elements(text: str) -> dict:
    """Run the extraction prompt on *text* and return the parsed JSON dict.

    Returns a dict with "entities" and "relationships" keys (see
    EXTRACTION_PROMPT).  Raises json.JSONDecodeError if the model reply is
    not valid JSON.
    """
    llm = ChatOpenAI(model="gpt-4", temperature=0)
    prompt = ChatPromptTemplate.from_template(EXTRACTION_PROMPT)
    chain = prompt | llm
    result = chain.invoke({"text": text})
    # Bug fix: `json` was used without being imported (NameError at runtime).
    return json.loads(result.content)
# Duplicate of the extraction snippet above (bilingual-page artifact).
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt asking the LLM for strict JSON; doubled braces are literal braces
# for ChatPromptTemplate.
EXTRACTION_PROMPT = """Extract entities and relationships from the text.
Text: {text}
Return JSON:
{{
"entities": [
{{"name": "...", "type": "PERSON|ORG|PRODUCT|CONCEPT|...", "description": "..."}}
],
"relationships": [
{{"source": "...", "target": "...", "type": "WORKS_FOR|USES|RELATED_TO|...", "description": "..."}}
]
}}
"""


def extract_graph_elements(text: str) -> dict:
    """Run the extraction prompt on *text* and parse the JSON reply."""
    llm = ChatOpenAI(model="gpt-4", temperature=0)
    prompt = ChatPromptTemplate.from_template(EXTRACTION_PROMPT)
    chain = prompt | llm
    result = chain.invoke({"text": text})
    # NOTE(review): `json` is never imported in this snippet — add `import json`.
    return json.loads(result.content)

Store in Neo4j
存储至Neo4j
python
from neo4j import GraphDatabase


class GraphStore:
    """Thin Neo4j-backed store for extracted entities and relationships."""

    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Release the driver's connection pool (call when done)."""
        self.driver.close()

    def add_entity(self, entity: dict):
        """Upsert an entity node keyed by name (MERGE is idempotent)."""
        with self.driver.session() as session:
            session.run(
                """
                MERGE (e:Entity {name: $name})
                SET e.type = $type, e.description = $description
                """,
                name=entity["name"],
                type=entity["type"],
                description=entity["description"],
            )

    def add_relationship(self, rel: dict):
        """Upsert a RELATES edge between two entities.

        Both endpoints must already exist (MATCH, not MERGE); otherwise the
        statement is a silent no-op.
        """
        with self.driver.session() as session:
            session.run(
                """
                MATCH (a:Entity {name: $source})
                MATCH (b:Entity {name: $target})
                MERGE (a)-[r:RELATES {type: $type}]->(b)
                SET r.description = $description
                """,
                source=rel["source"],
                target=rel["target"],
                type=rel["type"],
                description=rel["description"],
            )

    def get_neighbors(self, entity: str, hops: int = 2) -> list:
        """Return all paths from *entity* up to *hops* relationships away.

        Bug fix: Cypher does not allow a query parameter inside a
        variable-length pattern bound ([*1..$hops] is a syntax error in
        Neo4j), so the hop count is validated as an int and interpolated
        into the query text instead.
        """
        hops = int(hops)
        if hops < 1:
            raise ValueError("hops must be >= 1")
        query = (
            "MATCH path = (e:Entity {name: $name})-[*1..%d]-(related) "
            "RETURN path" % hops
        )
        with self.driver.session() as session:
            result = session.run(query, name=entity)
            return [record["path"] for record in result]
# Duplicate of the GraphStore snippet above (bilingual-page artifact).
from neo4j import GraphDatabase


class GraphStore:
    """Thin Neo4j-backed store for extracted entities and relationships."""

    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def add_entity(self, entity: dict):
        """Upsert an entity node keyed by name (MERGE is idempotent)."""
        with self.driver.session() as session:
            session.run("""
MERGE (e:Entity {name: $name})
SET e.type = $type, e.description = $description
""",
                name=entity["name"],
                type=entity["type"],
                description=entity["description"]
            )

    def add_relationship(self, rel: dict):
        """Upsert a RELATES edge; both endpoints must already exist (MATCH)."""
        with self.driver.session() as session:
            session.run("""
MATCH (a:Entity {name: $source})
MATCH (b:Entity {name: $target})
MERGE (a)-[r:RELATES {type: $type}]->(b)
SET r.description = $description
""",
                source=rel["source"],
                target=rel["target"],
                type=rel["type"],
                description=rel["description"]
            )

    def get_neighbors(self, entity: str, hops: int = 2) -> list:
        """Return paths up to *hops* relationships away from *entity*."""
        with self.driver.session() as session:
            # NOTE(review): Cypher does not accept a parameter inside a
            # variable-length bound ([*1..$hops]) — interpolate a validated
            # integer into the query text instead.
            result = session.run("""
MATCH path = (e:Entity {name: $name})-[*1..$hops]-(related)
RETURN path
""",
                name=entity, hops=hops
            )
            return [record["path"] for record in result]

GraphRAG Retrieval Strategies
GraphRAG 检索策略
1. Entity-Centric Retrieval
1. 实体中心检索
python
def entity_centric_retrieve(query: str, graph: GraphStore, vectorstore) -> list:
    """Extract entities from the query, expand via the graph, retrieve chunks.

    Returns a de-duplicated list of retrieved chunk documents.
    """
    # Entities explicitly mentioned in the query.
    entities = extract_entities(query)

    # Expand with graph neighbors.  Bug fix: GraphStore.get_neighbors returns
    # Neo4j *paths*, not entity-name strings, so collect node names from each
    # path (same access pattern as path_retrieve in this file) instead of
    # adding raw path objects to the set.
    expanded_entities = set(entities)
    for entity in entities:
        for path in graph.get_neighbors(entity, hops=2):
            expanded_entities.update(node["name"] for node in path.nodes)

    # Retrieve chunks that mention any of the expanded entities.
    chunks = []
    for entity in expanded_entities:
        results = vectorstore.similarity_search(
            entity,
            k=3,
            filter={"entities": {"$contains": entity}},
        )
        chunks.extend(results)
    return deduplicate(chunks)
def entity_centric_retrieve(query: str, graph: GraphStore, vectorstore) -> list:
    """Extract entities from query, expand via graph, retrieve chunks."""
    # Extract entities from query
    entities = extract_entities(query)
    # Get graph neighbors
    expanded_entities = set(entities)
    for entity in entities:
        neighbors = graph.get_neighbors(entity, hops=2)
        # NOTE(review): get_neighbors returns Neo4j path records, not name
        # strings — this mixes path objects with names in the set; confirm
        # the intended neighbor shape.
        expanded_entities.update(neighbors)
    # Retrieve chunks mentioning these entities
    chunks = []
    for entity in expanded_entities:
        results = vectorstore.similarity_search(
            entity,
            k=3,
            filter={"entities": {"$contains": entity}}
        )
        chunks.extend(results)
    return deduplicate(chunks)

2. Path-Based Retrieval
2. 基于路径的检索
python
def path_retrieve(query: str, entity_a: str, entity_b: str, graph: GraphStore) -> list:
    """Find shortest connection paths between two named entities.

    Returns a list of human-readable "A -> B -> C" strings (bug fix: the
    annotation said ``str`` but the function has always returned a list).
    The *query* argument is currently unused; it is kept for interface
    parity with the other retrieval strategies.
    """
    with graph.driver.session() as session:
        result = session.run(
            """
            MATCH path = shortestPath(
                (a:Entity {name: $entity_a})-[*..5]-(b:Entity {name: $entity_b})
            )
            RETURN path, length(path) as hops
            ORDER BY hops
            LIMIT 5
            """,
            entity_a=entity_a,
            entity_b=entity_b,
        )
        paths = []
        for record in result:
            path = record["path"]
            # Render the node sequence as "A -> B -> C".
            paths.append(" -> ".join(node["name"] for node in path.nodes))
        return paths
def path_retrieve(query: str, entity_a: str, entity_b: str, graph: GraphStore) -> list:
    """Find and explain paths between entities."""
    # Annotation fixed: a list of "A -> B -> C" strings is returned, not str.
    # NOTE(review): *query* is unused — kept for interface parity?  Confirm.
    with graph.driver.session() as session:
        result = session.run("""
MATCH path = shortestPath(
(a:Entity {name: $entity_a})-[*..5]-(b:Entity {name: $entity_b})
)
RETURN path, length(path) as hops
ORDER BY hops
LIMIT 5
""",
            entity_a=entity_a, entity_b=entity_b
        )
        paths = []
        for record in result:
            path = record["path"]
            # Render the node sequence as "A -> B -> C".
            path_str = " -> ".join([node["name"] for node in path.nodes])
            paths.append(path_str)
        return paths

3. Community-Based Retrieval (Microsoft GraphRAG)
3. 基于社区的检索(微软GraphRAG)
python
from collections import defaultdict

from graspologic.partition import hierarchical_leiden


def build_communities(graph: GraphStore) -> dict:
    """Detect graph communities and summarize each one.

    Returns {community_id: summary_text}.

    Bug fix: ``hierarchical_leiden`` returns a flat list of
    ``HierarchicalCluster`` records (node, cluster, level, ...), not a
    ``{community_id: members}`` mapping, so the original
    ``communities.items()`` raised AttributeError.  Group the final-level
    assignments by cluster id before summarizing.
    """
    # Export the graph to networkx for the partitioner.
    nx_graph = graph.to_networkx()

    # Detect communities at multiple levels of granularity.
    partitions = hierarchical_leiden(nx_graph, max_cluster_size=10)
    members_by_community = defaultdict(list)
    for assignment in partitions:
        # Keep only the finest-level (final) cluster assignments.
        if assignment.is_final_cluster:
            members_by_community[assignment.cluster].append(assignment.node)

    # Summarize each community from its members' descriptions.
    community_summaries = {}
    for community_id, members in members_by_community.items():
        member_descriptions = [graph.get_entity(m)["description"] for m in members]
        community_summaries[community_id] = summarize_community(member_descriptions)
    return community_summaries


def community_retrieve(query: str, community_summaries: dict) -> list:
    """Search community summaries first, then drill down to member chunks.

    NOTE(review): ``community_summaries`` is unused here — the summaries are
    assumed to be indexed in the module-level ``vectorstore`` with
    ``type="community_summary"`` metadata; confirm against the indexing code.
    """
    # Find the most relevant community summaries.
    relevant = vectorstore.similarity_search(
        query,
        k=3,
        filter={"type": "community_summary"},
    )
    # Collect the entities belonging to those communities.
    entities = []
    for community in relevant:
        entities.extend(community.metadata["members"])
    # Retrieve detailed chunks for those entities.
    return retrieve_by_entities(entities)
# Duplicate of the community snippet above (bilingual-page artifact).
from graspologic.partition import hierarchical_leiden


def build_communities(graph: GraphStore) -> dict:
    """Detect communities for hierarchical summarization."""
    # Export graph to networkx
    nx_graph = graph.to_networkx()
    # Detect communities at multiple levels
    # NOTE(review): hierarchical_leiden returns a list of HierarchicalCluster
    # records, not a dict — .items() below will raise AttributeError; group
    # assignments by cluster id first.
    communities = hierarchical_leiden(nx_graph, max_cluster_size=10)
    # Summarize each community
    community_summaries = {}
    for community_id, members in communities.items():
        member_descriptions = [graph.get_entity(m)["description"] for m in members]
        summary = summarize_community(member_descriptions)
        community_summaries[community_id] = summary
    return community_summaries


def community_retrieve(query: str, community_summaries: dict) -> list:
    """Search community summaries first, then drill down."""
    # NOTE(review): community_summaries is unused; summaries appear to be
    # indexed in a module-level vectorstore — confirm.
    # Find relevant communities
    relevant = vectorstore.similarity_search(
        query,
        k=3,
        filter={"type": "community_summary"}
    )
    # Get entities from those communities
    entities = []
    for community in relevant:
        entities.extend(community.metadata["members"])
    # Retrieve detailed chunks
    return retrieve_by_entities(entities)

LangChain + Neo4j Integration
LangChain + Neo4j 集成
python
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain

python
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain

Connect to Neo4j
Connect to Neo4j
# Connect to a local Neo4j instance over the Bolt protocol.
# (The snippet is repeated twice — bilingual-page artifact.)
graph = Neo4jGraph(
    url="bolt://localhost:7687",
    username="neo4j",
    password="password"
)
graph = Neo4jGraph(
    url="bolt://localhost:7687",
    username="neo4j",
    password="password"
)
Natural language to Cypher
Natural language to Cypher
# Build a natural-language -> Cypher QA chain over the connected graph;
# return_intermediate_steps exposes the generated Cypher for inspection.
# (The snippet is repeated twice — bilingual-page artifact.)
chain = GraphCypherQAChain.from_llm(
    llm=ChatOpenAI(model="gpt-4"),
    graph=graph,
    verbose=True,
    return_intermediate_steps=True
)
chain = GraphCypherQAChain.from_llm(
    llm=ChatOpenAI(model="gpt-4"),
    graph=graph,
    verbose=True,
    return_intermediate_steps=True
)
Query in natural language
Query in natural language
# Ask a question in natural language; the chain generates and runs Cypher.
# (The snippet is repeated twice — bilingual-page artifact.)
result = chain.invoke({
    "query": "Who are the engineers working on Project Atlas?"
})
result = chain.invoke({
    "query": "Who are the engineers working on Project Atlas?"
})
Automatically generates: MATCH (p:Person)-[:WORKS_ON]->(proj:Project {name: 'Atlas'}) RETURN p
Automatically generates: MATCH (p:Person)-[:WORKS_ON]->(proj:Project {name: 'Atlas'}) RETURN p
Hybrid Graph + Vector Pipeline
混合图+向量流水线
python
class GraphRAG:
    """Hybrid retriever/generator combining vector search with graph expansion."""

    def __init__(self, graph: GraphStore, vectorstore, llm):
        self.graph = graph
        self.vectorstore = vectorstore
        self.llm = llm

    def retrieve(self, query: str) -> dict:
        """Gather context for *query*.

        Returns {"chunks": [...], "graph_context": [...]} — bug fix: the
        annotation said ``list`` but a dict has always been returned.
        """
        # 1. Vector search for the initial candidate chunks.
        vector_results = self.vectorstore.similarity_search(query, k=10)

        # 2. Collect the entities those chunks mention.
        entities = set()
        for doc in vector_results:
            entities.update(doc.metadata.get("entities", []))

        # 3. Expand via the graph; cap fan-out to keep the context small.
        #    NOTE(review): set order is arbitrary, so which 5 entities are
        #    expanded is nondeterministic — confirm whether that matters.
        graph_context = []
        for entity in list(entities)[:5]:
            for neighbor in self.graph.get_neighbors(entity, hops=1):
                # NOTE(review): get_neighbors returns Neo4j paths, which do
                # not support ['relationship']/['name'] indexing — confirm
                # the intended neighbor shape before relying on this line.
                graph_context.append(
                    f"{entity} -> {neighbor['relationship']} -> {neighbor['name']}"
                )

        # 4. Combine both context sources.
        return {
            "chunks": vector_results,
            "graph_context": graph_context,
        }

    def generate(self, query: str, context: dict) -> str:
        """Answer *query* from the combined chunk + graph context."""
        # Join outside the f-string; the original used chr(10) because
        # f-string expressions could not contain backslashes before 3.12.
        relationships = "\n".join(context["graph_context"])
        prompt = f"""Answer based on the context.
Text chunks:
{self._format_chunks(context['chunks'])}
Entity relationships:
{relationships}
Question: {query}
"""
        return self.llm.invoke(prompt).content
# Duplicate of the GraphRAG snippet above (bilingual-page artifact).
class GraphRAG:
    """Hybrid retriever/generator combining vector search with graph expansion."""

    def __init__(self, graph: GraphStore, vectorstore, llm):
        self.graph = graph
        self.vectorstore = vectorstore
        self.llm = llm

    def retrieve(self, query: str) -> dict:
        """Return {"chunks": [...], "graph_context": [...]} for *query*."""
        # (annotation fixed: a dict is returned, not a list)
        # 1. Vector search for initial chunks
        vector_results = self.vectorstore.similarity_search(query, k=10)
        # 2. Extract entities from results
        entities = set()
        for doc in vector_results:
            entities.update(doc.metadata.get("entities", []))
        # 3. Expand via graph
        graph_context = []
        for entity in list(entities)[:5]:  # Limit expansion
            neighbors = self.graph.get_neighbors(entity, hops=1)
            for neighbor in neighbors:
                # NOTE(review): get_neighbors returns Neo4j paths, which do
                # not support ['relationship']/['name'] indexing — confirm.
                graph_context.append(f"{entity} -> {neighbor['relationship']} -> {neighbor['name']}")
        # 4. Combine contexts
        return {
            "chunks": vector_results,
            "graph_context": graph_context
        }

    def generate(self, query: str, context: dict) -> str:
        """Answer *query* from the combined chunk + graph context."""
        # chr(10) is "\n" — workaround for backslashes in f-strings (< 3.12).
        prompt = f"""Answer based on the context.
Text chunks:
{self._format_chunks(context['chunks'])}
Entity relationships:
{chr(10).join(context['graph_context'])}
Question: {query}
"""
        return self.llm.invoke(prompt).content

Best Practices
最佳实践
- Extract consistently - use same LLM/prompt for all documents
- Normalize entities - "AWS", "Amazon Web Services" → same node
- Limit graph depth - 2-3 hops usually sufficient
- Cache traversals - graph queries can be expensive
- Combine with vectors - graph alone misses semantic similarity
- Version your schema - entity/relationship types will evolve
- 保持提取一致性 - 对所有文档使用相同的大语言模型/提示词
- 实体归一化 - 将“AWS”、“Amazon Web Services”映射为同一节点
- 限制图遍历深度 - 通常2-3跳已足够
- 缓存遍历结果 - 图查询开销较高
- 与向量检索结合 - 仅使用图检索会缺失语义相似性
- 版本化 schema - 实体/关系类型会随时间演进