Loading...
Loading...
Process multimodal inputs (images, video, audio, PDFs) with Gemini 3 Pro. Covers image understanding, video analysis, audio processing, document extraction, media resolution control, OCR, and token optimization. Use when analyzing images, processing video, transcribing audio, extracting PDF content, or working with multimodal data.
npx skill4agent add adaptationio/skrillz gemini-3-multimodal

Related skills: gemini-3-image-generation, gemini-3-pro-api

import google.generativeai as genai
from pathlib import Path

genai.configure(api_key="YOUR_API_KEY")
model = genai.GenerativeModel("gemini-3-pro-preview")

# Upload the image via the File API, then ask the model about it.
image_file = genai.upload_file(Path("photo.jpg"))
response = model.generate_content([
    "What's in this image?",
    image_file
])
print(response.text)import { GoogleGenerativeAI } from "@google/generative-ai";
import { GoogleAIFileManager } from "@google/generative-ai/server";
import fs from "fs"; // NOTE(review): unused in this snippet — confirm before removing

const genAI = new GoogleGenerativeAI("YOUR_API_KEY");
const fileManager = new GoogleAIFileManager("YOUR_API_KEY");

// Upload the image, then reference it by URI in the prompt.
const uploadResult = await fileManager.uploadFile("photo.jpg", {
  mimeType: "image/jpeg"
});
const model = genAI.getGenerativeModel({ model: "gemini-3-pro-preview" });
const result = await model.generateContent([
  "What's in this image?",
  { fileData: { fileUri: uploadResult.file.uri, mimeType: uploadResult.file.mimeType } }
]);
console.log(result.response.text());import google.generativeai as genai
from pathlib import Path

genai.configure(api_key="YOUR_API_KEY")

# Configure model with high resolution for best quality (OCR, fine detail).
model = genai.GenerativeModel(
    "gemini-3-pro-preview",
    generation_config={
        "thinking_level": "high",
        "media_resolution": "high"  # 1,120 tokens per image
    }
)

# Upload image.
image_path = Path("screenshot.png")
image_file = genai.upload_file(image_path)

# Analyze with a structured prompt.
response = model.generate_content([
    """Analyze this image and provide:
1. Main objects and their locations
2. Any visible text (OCR)
3. Overall context and purpose
4. If code/UI: describe the functionality
""",
    image_file
])
print(response.text)
# Check token usage
print(f"Tokens used: {response.usage_metadata.total_token_count}")import { GoogleGenerativeAI } from "@google/generative-ai";
import { GoogleAIFileManager } from "@google/generative-ai/server";

// NOTE(review): the non-null assertion (!) requires TypeScript.
const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
const fileManager = new GoogleAIFileManager(process.env.GEMINI_API_KEY!);

// Upload image
const uploadResult = await fileManager.uploadFile("screenshot.png", {
  mimeType: "image/png"
});

// Configure model with high resolution
const model = genAI.getGenerativeModel({
  model: "gemini-3-pro-preview",
  generationConfig: {
    thinking_level: "high",
    media_resolution: "high" // Best quality for OCR
  }
});
const result = await model.generateContent([
  `Analyze this image and provide:
1. Main objects and their locations
2. Any visible text (OCR)
3. Overall context and purpose`,
  { fileData: { fileUri: uploadResult.file.uri, mimeType: uploadResult.file.mimeType } }
]);
console.log(result.response.text());| Resolution | Tokens per Image | Best For |
|---|---|---|
| Low | 280 tokens | Quick analysis, low detail |
| Medium | 560 tokens | Balanced quality/cost |
| High | 1,120 tokens | OCR, fine details, small text |
references/image-understanding.mdimport google.generativeai as genai
from pathlib import Path
import time

genai.configure(api_key="YOUR_API_KEY")

# Configure for video processing.
model = genai.GenerativeModel(
    "gemini-3-pro-preview",
    generation_config={
        "thinking_level": "high",
        "media_resolution": "medium"  # 70 tokens/frame (balanced)
    }
)

# Upload video (up to 1 hour supported).
video_path = Path("tutorial.mp4")
video_file = genai.upload_file(video_path)

# Poll until the File API finishes processing the upload.
while video_file.state.name == "PROCESSING":
    time.sleep(5)
    video_file = genai.get_file(video_file.name)

if video_file.state.name == "FAILED":
    raise ValueError("Video processing failed")

# Analyze video.
response = model.generate_content([
    """Analyze this video and provide:
1. Overall summary of content
2. Key scenes and timestamps
3. Main topics covered
4. Any visible text throughout the video
""",
    video_file
])
print(response.text)
print(f"Tokens used: {response.usage_metadata.total_token_count}")import { GoogleGenerativeAI } from "@google/generative-ai";
import { GoogleAIFileManager, FileState } from "@google/generative-ai/server";

// NOTE(review): the non-null assertion (!) requires TypeScript.
const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
const fileManager = new GoogleAIFileManager(process.env.GEMINI_API_KEY!);

// Upload video
const uploadResult = await fileManager.uploadFile("tutorial.mp4", {
  mimeType: "video/mp4"
});

// Poll until the File API finishes processing the upload.
let file = await fileManager.getFile(uploadResult.file.name);
while (file.state === FileState.PROCESSING) {
  await new Promise(resolve => setTimeout(resolve, 5000));
  file = await fileManager.getFile(uploadResult.file.name);
}
if (file.state === FileState.FAILED) {
  throw new Error("Video processing failed");
}

// Analyze video
const model = genAI.getGenerativeModel({
  model: "gemini-3-pro-preview",
  generationConfig: {
    media_resolution: "medium"
  }
});
const result = await model.generateContent([
  `Analyze this video and provide:
1. Overall summary
2. Key scenes and timestamps
3. Main topics covered`,
  { fileData: { fileUri: file.uri, mimeType: file.mimeType } }
]);
console.log(result.response.text());references/video-processing.mdimport google.generativeai as genai
from pathlib import Path
import time

genai.configure(api_key="YOUR_API_KEY")
model = genai.GenerativeModel("gemini-3-pro-preview")

# Upload audio file (up to 9.5 hours supported).
audio_path = Path("podcast.mp3")
audio_file = genai.upload_file(audio_path)

# Poll until the File API finishes processing the upload.
while audio_file.state.name == "PROCESSING":
    time.sleep(5)
    audio_file = genai.get_file(audio_file.name)

# Fail fast on a failed upload (mirrors the video example).
if audio_file.state.name == "FAILED":
    raise ValueError("Audio processing failed")

# Process audio.
response = model.generate_content([
    """Process this audio and provide:
1. Full transcription
2. Summary of main points
3. Key speakers (if multiple)
4. Important timestamps
5. Action items or conclusions
""",
    audio_file
])
print(response.text)
print(f"Tokens used: {response.usage_metadata.total_token_count}")import { GoogleGenerativeAI } from "@google/generative-ai";
import { GoogleAIFileManager, FileState } from "@google/generative-ai/server";

// NOTE(review): the non-null assertion (!) requires TypeScript.
const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
const fileManager = new GoogleAIFileManager(process.env.GEMINI_API_KEY!);

// Upload audio
const uploadResult = await fileManager.uploadFile("podcast.mp3", {
  mimeType: "audio/mp3"
});

// Poll until the File API finishes processing the upload.
let file = await fileManager.getFile(uploadResult.file.name);
while (file.state === FileState.PROCESSING) {
  await new Promise(resolve => setTimeout(resolve, 5000));
  file = await fileManager.getFile(uploadResult.file.name);
}
// Fail fast on a failed upload (mirrors the video example).
if (file.state === FileState.FAILED) {
  throw new Error("Audio processing failed");
}

const model = genAI.getGenerativeModel({ model: "gemini-3-pro-preview" });
const result = await model.generateContent([
  `Process this audio and provide:
1. Full transcription
2. Summary of main points
3. Key timestamps`,
  { fileData: { fileUri: file.uri, mimeType: file.mimeType } }
]);
console.log(result.response.text());references/audio-processing.mdimport google.generativeai as genai
from pathlib import Path
import time

genai.configure(api_key="YOUR_API_KEY")

# Configure with medium resolution (recommended for PDFs).
model = genai.GenerativeModel(
    "gemini-3-pro-preview",
    generation_config={
        "thinking_level": "high",
        "media_resolution": "medium"  # 560 tokens/page (saturation point)
    }
)

# Upload PDF.
pdf_path = Path("research_paper.pdf")
pdf_file = genai.upload_file(pdf_path)

# Poll until the File API finishes processing the upload.
while pdf_file.state.name == "PROCESSING":
    time.sleep(5)
    pdf_file = genai.get_file(pdf_file.name)

# Fail fast on a failed upload (mirrors the video example).
if pdf_file.state.name == "FAILED":
    raise ValueError("PDF processing failed")

# Analyze PDF.
response = model.generate_content([
    """Analyze this PDF document and provide:
1. Document type and purpose
2. Main sections and structure
3. Key findings or arguments
4. Important data or statistics
5. Conclusions or recommendations
""",
    pdf_file
])
print(response.text)
print(f"Tokens used: {response.usage_metadata.total_token_count}")import { GoogleGenerativeAI } from "@google/generative-ai";
import { GoogleAIFileManager, FileState } from "@google/generative-ai/server";

// NOTE(review): the non-null assertion (!) requires TypeScript.
const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
const fileManager = new GoogleAIFileManager(process.env.GEMINI_API_KEY!);

// Upload PDF
const uploadResult = await fileManager.uploadFile("research_paper.pdf", {
  mimeType: "application/pdf"
});

// Poll until the File API finishes processing the upload.
let file = await fileManager.getFile(uploadResult.file.name);
while (file.state === FileState.PROCESSING) {
  await new Promise(resolve => setTimeout(resolve, 5000));
  file = await fileManager.getFile(uploadResult.file.name);
}
// Fail fast on a failed upload (mirrors the video example).
if (file.state === FileState.FAILED) {
  throw new Error("PDF processing failed");
}

// Analyze with medium resolution (recommended).
const model = genAI.getGenerativeModel({
  model: "gemini-3-pro-preview",
  generationConfig: {
    media_resolution: "medium"
  }
});
const result = await model.generateContent([
  `Analyze this PDF and extract:
1. Main sections
2. Key findings
3. Important data`,
  { fileData: { fileUri: file.uri, mimeType: file.mimeType } }
]);
console.log(result.response.text());mediumreferences/document-processing.md| Media Type | Resolution | Tokens | Use Case |
|---|---|---|---|
| Images | Low | 280 | Quick scan, thumbnails |
| Images | Medium | 560 | General analysis |
| Images | High | 1,120 | OCR, fine details, code |
| PDFs | Medium | 560/page | Recommended (saturation point) |
| PDFs | High | 1,120/page | Diminishing returns |
| Video | Medium | 70/frame | Most use cases |
| Video | High | 280/frame | OCR from video |
import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")


# Different resolutions for different use cases.
def analyze_image_optimized(image_path, need_ocr=False):
    """Analyze an image with a resolution appropriate to the task.

    Uses "high" (1,120 tokens) when OCR-level detail is needed,
    otherwise "medium" (560 tokens) to reduce cost.
    Prints the resolution, token count, and estimated input cost,
    then returns the model's text response.
    """
    resolution = "high" if need_ocr else "medium"
    model = genai.GenerativeModel(
        "gemini-3-pro-preview",
        generation_config={
            "media_resolution": resolution
        }
    )
    image_file = genai.upload_file(image_path)
    response = model.generate_content([
        "Describe this image" if not need_ocr else "Extract all text from this image",
        image_file
    ])
    # Log token usage for cost tracking.
    tokens = response.usage_metadata.total_token_count
    cost = (tokens / 1_000_000) * 2.00  # Input pricing
    print(f"Resolution: {resolution}, Tokens: {tokens}, Cost: ${cost:.6f}")
    return response.text


# Use appropriate resolution
analyze_image_optimized("photo.jpg", need_ocr=False)  # medium
analyze_image_optimized("document.png", need_ocr=True) # high# Set different resolutions for different media in same request
response = model.generate_content([
"Compare these images",
{"file": image1, "media_resolution": "high"}, # High detail
{"file": image2, "media_resolution": "low"}, # Low detail OK
])def log_media_costs(response):
"""Log media processing costs"""
usage = response.usage_metadata
# Pricing for ≤200k context
input_cost = (usage.prompt_token_count / 1_000_000) * 2.00
output_cost = (usage.candidates_token_count / 1_000_000) * 12.00
print(f"Input tokens: {usage.prompt_token_count} (${input_cost:.6f})")
print(f"Output tokens: {usage.candidates_token_count} (${output_cost:.6f})")
print(f"Total cost: ${input_cost + output_cost:.6f}")references/token-optimization.md| Setting | Images | PDFs | Video (per frame) | Recommendation |
|---|---|---|---|---|
| Low | 280 tokens | 280 tokens | 70 tokens | Quick analysis, low detail |
| Medium | 560 tokens | 560 tokens | 70 tokens | Balanced quality/cost |
| High | 1,120 tokens | 1,120 tokens | 280 tokens | OCR, fine text, details |
model = genai.GenerativeModel(
"gemini-3-pro-preview",
generation_config={
"media_resolution": "high" # Applies to all media
}
)response = model.generate_content([
"Analyze these files",
{"file": high_detail_image, "media_resolution": "high"},
{"file": low_detail_image, "media_resolution": "low"}
])highmediummediumlowmediumlowreferences/media-resolution.mdimport google.generativeai as genai
import time  # required by the polling loop below

# Upload file
file = genai.upload_file("path/to/file.jpg")
print(f"Uploaded: {file.name}")

# Poll until the File API finishes processing the upload.
while file.state.name == "PROCESSING":
    time.sleep(5)
    file = genai.get_file(file.name)
print(f"Status: {file.state.name}")# List all files
for file in genai.list_files():
print(f"{file.name} - {file.display_name}")# Delete specific file
genai.delete_file(file.name)
# Delete all files
for file in genai.list_files():
genai.delete_file(file.name)
print(f"Deleted: {file.name}")# Upload multiple images
images = [
    genai.upload_file("photo1.jpg"),
    genai.upload_file("photo2.jpg"),
    genai.upload_file("photo3.jpg")
]

# Analyze all three images together in a single request.
response = model.generate_content([
    "Compare these images and identify common elements",
    *images
])
print(response.text)# Combine different media types
image = genai.upload_file("chart.png")
pdf = genai.upload_file("report.pdf")
response = model.generate_content([
"Does the chart match the data in the report?",
image,
pdf
])model = genai.GenerativeModel("gemini-3-pro-preview")
chat = model.start_chat()
# Upload the image once; the chat keeps it in context for follow-up turns.
image = genai.upload_file("product.jpg")
# Ask questions about it
response1 = chat.send_message(["What product is this?", image])
response2 = chat.send_message("What are its main features?")
response3 = chat.send_message("What's the price range for similar products?")import google.generativeai as genai
from pathlib import Path
genai.configure(api_key="YOUR_API_KEY")
model = genai.GenerativeModel(
"gemini-3-pro-preview",
generation_config={"media_resolution": "medium"}
)
# Process all PDFs in directory
pdf_dir = Path("documents/")
results = {}
for pdf_path in pdf_dir.glob("*.pdf"):
pdf_file = genai.upload_file(pdf_path)
# Wait for processing
while pdf_file.state.name == "PROCESSING":
time.sleep(5)
pdf_file = genai.get_file(pdf_file.name)
# Extract key information
response = model.generate_content([
"Extract: 1) Document type, 2) Key dates, 3) Important numbers, 4) Summary",
pdf_file
])
results[pdf_path.name] = response.text
# Clean up
genai.delete_file(pdf_file.name)
# Save results
import json
with open("analysis_results.json", "w") as f:
json.dump(results, f, indent=2)video = genai.upload_file("user_upload.mp4")
import time  # required by the polling loop below

# Poll until the File API finishes processing the upload.
while video.state.name == "PROCESSING":
    time.sleep(10)
    video = genai.get_file(video.name)

response = model.generate_content([
    """Analyze this video for:
1. Inappropriate content (yes/no)
2. Violence or harmful content (yes/no)
3. Overall content rating (G/PG/PG-13/R)
4. Brief justification
Provide structured response.
""",
    video
])
print(response.text)