Compare original and translation side by side.
# Install dependencies first:
#   pip install transformers torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax

# Prompt-Guard-86M is a small 3-way sequence classifier:
# label 0 = BENIGN, label 1 = INJECTION, label 2 = JAILBREAK.
model_id = "meta-llama/Prompt-Guard-86M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.eval()  # inference mode: disables dropout etc.
def get_jailbreak_score(text):
    """Check user input for jailbreak attempts.

    Args:
        text: Raw user message to classify.

    Returns:
        float: Probability (0-1) that the text is a direct jailbreak
        attempt (Prompt-Guard label 2, JAILBREAK).
    """
    # Truncate to the model's 512-token context window.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    probs = softmax(logits, dim=-1)
    return probs[0, 2].item()  # index 2 == JAILBREAK class
# pip install transformers torch
# (The transformers import below was fused into the previous snippet by
# page extraction; restored here so this copy is self-contained.)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax

model_id = "meta-llama/Prompt-Guard-86M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.eval()  # inference mode
def get_jailbreak_score(text):
    """Return the probability that *text* is a jailbreak attempt.

    The score is the softmax probability of Prompt-Guard label 2
    (JAILBREAK) for the first 512 tokens of the input.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        class_probs = softmax(model(**encoded).logits, dim=-1)
    return class_probs[0, 2].item()
**Classification labels**:
- **BENIGN** (label 0): Normal content
- **INJECTION** (label 1): Embedded instructions in data
- **JAILBREAK** (label 2): Direct override attempts
**分类标签**:
- **良性内容** (标签0): 正常内容
- **提示注入** (标签1): 数据中嵌入的指令
- **越狱尝试** (标签2): 直接绕过安全限制的尝试def filter_user_input(user_message, threshold=0.5):
"""
Filter user input for jailbreak attempts.
Returns: (is_safe, score, message)
"""
score = get_jailbreak_score(user_message)
if score >= threshold:
return False, score, "Input blocked: jailbreak attempt"
else:
return True, score, "Input safe"def filter_user_input(user_message, threshold=0.5):
"""
过滤用户输入中的越狱尝试。
返回: (是否安全, 分数, 提示信息)
"""
score = get_jailbreak_score(user_message)
if score >= threshold:
return False, score, "输入被拦截:存在越狱尝试"
else:
return True, score, "输入安全"
**Common jailbreak patterns detected** (example code was lost during page extraction):
**可检测的常见越狱模式**(示例代码在页面提取时丢失):
def get_indirect_injection_score(text):
    """Check third-party data for embedded instructions.

    Args:
        text: Content from an untrusted third-party source
            (API response, scraped page, RAG document).

    Returns:
        float: Combined probability of INJECTION (label 1) plus
        JAILBREAK (label 2) — either label means the data carries
        instructions it should not.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = softmax(logits, dim=-1)
    # Sum INJECTION + JAILBREAK probabilities.
    return (probs[0, 1] + probs[0, 2]).item()
def filter_third_party_data(data, threshold=0.3):
    """Filter third-party data (API responses, web scraping, RAG docs).

    Uses a lower threshold (0.3) than user input because third-party
    content should never contain instructions at all.

    Args:
        data: Untrusted third-party text.
        threshold: Combined injection-probability cutoff (default 0.3).

    Returns:
        tuple: (is_safe, score, message).
    """
    score = get_indirect_injection_score(data)
    if score >= threshold:
        return False, score, "Data blocked: suspected injection"
    return True, score, "Data safe"
"""检查第三方数据中是否存在嵌入的指令。"""
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
logits = model(**inputs).logits
probs = softmax(logits, dim=-1)
# 计算提示注入 + 越狱尝试的概率总和
return (probs[0, 1] + probs[0, 2]).item()
def filter_third_party_data(data, threshold=0.3):
    """Screen third-party content (API responses, scraped pages, RAG docs).

    A stricter threshold (0.3) is applied to third-party data than to
    direct user input.

    Returns:
        tuple: (is_safe, score, message) — messages are Chinese-localized.
    """
    score = get_indirect_injection_score(data)
    blocked = score >= threshold
    if blocked:
        return False, score, "数据被拦截:疑似存在注入"
    return True, score, "数据安全"
**Common injection patterns detected** (example code was lost during page extraction):
**可检测的常见注入模式**(示例代码在页面提取时丢失):
def batch_filter_documents(documents, threshold=0.3, batch_size=32):
    """Batch filter documents for prompt injections.

    Args:
        documents: List of document strings.
        threshold: Detection threshold (default 0.3).
        batch_size: Number of documents tokenized and scored per
            forward pass.

    Returns:
        List of (doc, score, is_safe) tuples, in input order.
    """
    results = []
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        # Pad within the batch so one tensor covers every document.
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = softmax(logits, dim=-1)
        # Per-document injection score: P(label 1) + P(label 2).
        scores = (probs[:, 1] + probs[:, 2]).tolist()
        results.extend(
            (doc, score, score < threshold) for doc, score in zip(batch, scores)
        )
    return results
"""
批量过滤文档中的提示注入。
参数:
documents: 文档字符串列表
threshold: 检测阈值(默认0.3)
batch_size: 处理批次大小
返回:
(文档, 分数, 是否安全) 元组列表
"""
results = []
for i in range(0, len(documents), batch_size):
batch = documents[i:i + batch_size]
# 对批次进行分词
inputs = tokenizer(
batch,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
with torch.no_grad():
logits = model(**inputs).logits
probs = softmax(logits, dim=-1)
# 注入分数(标签1 + 标签2)
scores = (probs[:, 1] + probs[:, 2]).tolist()
for doc, score in zip(batch, scores):
is_safe = score < threshold
results.append((doc, score, is_safe))
return resultsundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefined
**Solution**: Context-aware filtering with user reputation:
```python
def filter_with_context(text, user_is_trusted):
score = get_jailbreak_score(text)
# Higher threshold for trusted users
threshold = 0.7 if user_is_trusted else 0.5
return score < thresholdundefined
**解决方案**: 结合用户信誉的上下文感知过滤:
```python
def filter_with_context(text, user_is_trusted):
score = get_jailbreak_score(text)
# 为可信用户设置更高的阈值
threshold = 0.7 if user_is_trusted else 0.5
return score < thresholdundefined
**Solution**: Sliding window with overlapping chunks:
```python
def score_long_text(text, chunk_size=512, overlap=256):
    """Score long texts with a sliding window of overlapping chunks.

    Overlapping windows ensure an attack spanning a chunk boundary
    still appears intact in at least one window.

    Args:
        text: Arbitrarily long input text.
        chunk_size: Tokens per window (model limit is 512).
        overlap: Tokens shared between consecutive windows; must be
            smaller than chunk_size.

    Returns:
        float: Maximum jailbreak score over all windows (0.0 for empty text).

    Raises:
        ValueError: If overlap >= chunk_size. Without this guard a zero
            step crashes range(), and a negative step silently skips the
            loop and returns 0.0 even for malicious text.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    tokens = tokenizer.encode(text)
    max_score = 0.0
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_text = tokenizer.decode(tokens[i:i + chunk_size])
        max_score = max(max_score, get_jailbreak_score(chunk_text))
    return max_score
**解决方案**: 使用带重叠块的滑动窗口:
```python
def score_long_text(text, chunk_size=512, overlap=256):
"""使用滑动窗口为长文本打分。"""
tokens = tokenizer.encode(text)
max_score = 0.0
for i in range(0, len(tokens), chunk_size - overlap):
chunk = tokens[i:i + chunk_size]
chunk_text = tokenizer.decode(chunk)
score = get_jailbreak_score(chunk_text)
max_score = max(max_score, score)
return max_score| Application Type | Threshold | TPR | FPR | Use Case |
|---|---|---|---|---|
| High Security | 0.3 | 98.5% | 5.2% | Banking, healthcare, government |
| Balanced | 0.5 | 95.7% | 2.1% | Enterprise SaaS, chatbots |
| Low Friction | 0.7 | 88.3% | 0.8% | Creative tools, research |
| 应用类型 | 阈值 | 真阳率 | 假阳率 | 使用场景 |
|---|---|---|---|---|
| 高安全要求 | 0.3 | 98.5% | 5.2% | 银行业、医疗保健、政府部门 |
| 平衡型 | 0.5 | 95.7% | 2.1% | 企业SaaS、聊天机器人 |
| 低摩擦型 | 0.7 | 88.3% | 0.8% | 创意工具、研究场景 |