Loading...
Loading...
Expert skill for implementing text-to-speech with Kokoro TTS. Covers voice synthesis, audio generation, performance optimization, and secure handling of generated audio for JARVIS voice assistant.
npx skill4agent add martinholovsky/claude-skills-generator text-to-speech

File Organization: Split structure. See `references/` for detailed implementations.
# tests/test_tts_engine.py
import pytest
from pathlib import Path
class TestSecureTTSEngine:
    """Behavioral tests for SecureTTSEngine (all use the tts_engine fixture)."""

    def test_synthesize_returns_valid_audio(self, tts_engine):
        """synthesize() writes a .wav file to disk and returns its path."""
        audio_path = tts_engine.synthesize("Hello test")
        assert Path(audio_path).exists()
        assert audio_path.endswith('.wav')

    def test_audio_has_correct_sample_rate(self, tts_engine):
        """Generated audio is written at 24 kHz (Kokoro's native rate)."""
        import soundfile as sf
        audio_path = tts_engine.synthesize("Test")
        _, sample_rate = sf.read(audio_path)
        assert sample_rate == 24000

    def test_rejects_empty_text(self, tts_engine):
        """Empty input must raise ValidationError, not produce audio."""
        # FIX: ValidationError was referenced without any import (NameError).
        from jarvis.tts import ValidationError  # TODO confirm export path
        with pytest.raises(ValidationError):
            tts_engine.synthesize("")

    def test_rejects_text_exceeding_limit(self, tts_engine):
        """Input over the 5000-char cap must raise ValidationError."""
        # FIX: ValidationError was referenced without any import (NameError).
        from jarvis.tts import ValidationError  # TODO confirm export path
        with pytest.raises(ValidationError):
            tts_engine.synthesize("x" * 6000)

    def test_filters_sensitive_content(self, tts_engine):
        """Sensitive text is filtered but synthesis still succeeds."""
        # NOTE(review): this only proves synthesis does not fail; it cannot
        # observe from the audio file that the secret was actually removed.
        audio_path = tts_engine.synthesize("password: secret123")
        assert Path(audio_path).exists()

    def test_cleanup_removes_temp_files(self, tts_engine):
        """cleanup() deletes the engine's private temp directory."""
        tts_engine.synthesize("Test")
        temp_dir = tts_engine.temp_dir
        tts_engine.cleanup()
        assert not Path(temp_dir).exists()
@pytest.fixture
def tts_engine():
    """Yield a SecureTTSEngine and guarantee temp-dir cleanup after the test."""
    from jarvis.tts import SecureTTSEngine
    engine = SecureTTSEngine(voice="af_heart")
    yield engine
    # Teardown: remove the engine's temp directory even if the test failed.
    engine.cleanup()

# pytest tests/test_tts_engine.py -v  # Run tests
pytest --cov=jarvis.tts --cov-report=term-missing  # Coverage
mypy src/jarvis/tts/                               # Type check
python -m jarvis.tts --test "Hello JARVIS"         # Integration

# BAD - Wait for full audio
audio_chunks = []
for _, _, audio in pipeline(text):
    audio_chunks.append(audio)
play_audio(np.concatenate(audio_chunks))  # Long wait

# GOOD - Stream chunks immediately
with sd.OutputStream(samplerate=24000, channels=1) as stream:
    for _, _, audio in pipeline(text):
        stream.write(audio)  # Play as generated

# BAD: pipeline = KPipeline(lang_code="a")  # Reload each time
# GOOD - Singleton pattern: build the expensive KPipeline once, reuse everywhere
class TTSEngine:
    # Process-wide pipeline instance, created lazily on first request.
    _pipeline = None

    @classmethod
    def get_pipeline(cls):
        """Return the shared KPipeline, constructing it on first call."""
        if cls._pipeline is None:
            cls._pipeline = KPipeline(lang_code="a")
        return cls._pipeline

# BAD: data, sr = sf.read(audio_path)  # Full file in RAM
# GOOD - Process in chunks (body of a generator function)
with sf.SoundFile(audio_path) as f:
    while f.tell() < len(f):
        yield process(f.read(24000))  # 24000 frames = one second at 24 kHz

# BAD: audio = engine.synthesize(text)  # Blocks event loop
# GOOD - Run in executor (inside an async function)
audio = await loop.run_in_executor(None, engine.synthesize, text)

# BAD: return SecureTTSEngine(voice=VOICES[voice_type])  # Cold start
# GOOD - Preload at startup
def _preload_voices(self, types: list[str]):
    """Eagerly build one SecureTTSEngine per requested voice type."""
    for t in types:
        self.engines[t] = SecureTTSEngine(voice=VOICES[t])

| Use Case | Version | Notes |
|---|---|---|
| Production | kokoro>=0.3.0 | Latest stable |
# requirements.txt
kokoro>=0.3.0
numpy>=1.24.0
soundfile>=0.12.0
sounddevice>=0.4.6
scipy>=1.10.0
pydantic>=2.0
structlog>=23.0

| Voice | Style | Use Case |
|---|---|---|
| af_heart | Warm, friendly | Default JARVIS |
| af_bella | Professional | Formal responses |
| am_adam | Male | Alternative voice |
| bf_emma | British | Accent variation |
import os
import re
import shutil
import tempfile
import uuid
from pathlib import Path

import numpy as np
import soundfile as sf
import structlog
from kokoro import KPipeline
logger = structlog.get_logger()
class SecureTTSEngine:
    r"""Secure text-to-speech engine wrapping a Kokoro pipeline.

    Validates and filters input text before synthesis, and writes audio
    into an owner-only (0o700) per-instance temp directory.
    """

    def __init__(self, voice: str = "af_heart", lang_code: str = "a"):
        """Create the Kokoro pipeline and a private temp directory.

        Args:
            voice: Kokoro voice id (e.g. "af_heart").
            lang_code: Kokoro language code ("a" = American English).
        """
        self.pipeline = KPipeline(lang_code=lang_code)
        self.voice = voice
        # Credential-like "key: value" prefixes whose values must never be spoken.
        self.blocked_patterns = [
            r"password\s*[:=]",
            r"api[_-]?key\s*[:=]",
            r"secret\s*[:=]",
        ]
        # Owner-only temp dir so other local users cannot read generated audio.
        self.temp_dir = tempfile.mkdtemp(prefix="jarvis_tts_")
        os.chmod(self.temp_dir, 0o700)
        logger.info("tts.initialized", voice=voice)

    def synthesize(self, text: str) -> str:
        """Synthesize *text* to a WAV file and return the file path.

        Raises:
            ValidationError: if the text is blank or exceeds 5000 chars.
            TTSError: if the pipeline produced no audio chunks.
        """
        if not self._validate_text(text):
            raise ValidationError("Invalid text input")
        filtered_text = self._filter_sensitive(text)

        # uuid4 filename avoids collisions and leaks nothing about the content.
        audio_path = Path(self.temp_dir) / f"{uuid.uuid4()}.wav"
        generator = self.pipeline(
            filtered_text,
            voice=self.voice,
            speed=1.0
        )
        # Kokoro yields (graphemes, phonemes, audio) triples per chunk.
        audio_chunks = [audio for _, _, audio in generator]
        if not audio_chunks:
            raise TTSError("No audio generated")

        full_audio = np.concatenate(audio_chunks)
        sf.write(str(audio_path), full_audio, 24000)  # Kokoro's native 24 kHz
        logger.info("tts.synthesized",
                    text_length=len(text),
                    audio_duration=len(full_audio) / 24000)
        return str(audio_path)

    def _validate_text(self, text: str) -> bool:
        """Return True when text is non-blank and within the 5000-char limit."""
        if not text or not text.strip():
            return False
        if len(text) > 5000:  # DoS guard: cap synthesis length
            logger.warning("tts.text_too_long", length=len(text))
            return False
        return True

    def _filter_sensitive(self, text: str) -> str:
        r"""Replace credential-like "key: value" pairs with '[FILTERED]'.

        BUG FIX: the original substituted ``pattern + r'\S+'``, so
        "password: secret123" (whitespace after the colon) logged a warning
        but was never rewritten — the secret was spoken verbatim.
        ``r'\s*\S+'`` consumes the optional whitespace before the value.
        """
        filtered = text
        for pattern in self.blocked_patterns:
            if re.search(pattern, filtered, re.IGNORECASE):
                logger.warning("tts.sensitive_content_filtered")
                filtered = re.sub(pattern + r'\s*\S+', '[FILTERED]',
                                  filtered, flags=re.IGNORECASE)
        return filtered

    def cleanup(self) -> None:
        """Remove the temp directory and every generated audio file in it."""
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

# Stream audio chunks as generated for low latency
with sd.OutputStream(samplerate=24000, channels=1) as stream:
    for _, _, audio in pipeline(text, voice=voice):
        stream.write(audio)  # Play immediately

# Cache common phrases with hash key
cache_key = hashlib.sha256(f"{text}:{voice}".encode()).hexdigest()
cache_path = cache_dir / f"{cache_key}.wav"
if cache_path.exists():
    return str(cache_path)  # Cache hit
# Generate, save to cache, return path

# Lazy-load engines per voice type
VOICES = {"default": "af_heart", "formal": "af_bella"}
def get_engine(voice_type: str) -> SecureTTSEngine:
    """Return a cached engine for *voice_type*, creating it on first request."""
    if voice_type not in engines:
        engines[voice_type] = SecureTTSEngine(voice=VOICES[voice_type])
    return engines[voice_type]

# Semaphore for concurrency + timeout for protection
# NOTE(review): constructing asyncio.Semaphore(2) inline creates a fresh
# semaphore on every call, so it limits nothing — share one module-level
# instance across calls. Confirm and fix at the call site.
async with asyncio.Semaphore(2):
    result = await asyncio.wait_for(
        loop.run_in_executor(None, engine.synthesize, text),
        timeout=30.0
)class ContentFilter:
"""Filter inappropriate content before synthesis."""
BLOCKED_CATEGORIES = [
"violence",
"hate_speech",
"explicit",
]
def filter(self, text: str) -> tuple[str, bool]:
"""Filter text and return (filtered_text, was_modified)."""
# Remove potential command injection
text = text.replace(";", "").replace("|", "").replace("&", "")
# Check for blocked patterns
for pattern in self.blocked_patterns:
if re.search(pattern, text, re.IGNORECASE):
return "[Content filtered]", True
return text, Falsedef validate_tts_input(text: str) -> bool:
"""Validate text for TTS synthesis."""
# Length limit
if len(text) > 5000:
raise ValidationError("Text too long (max 5000 chars)")
# Character validation
if not all(c.isprintable() or c in '\n\t' for c in text):
raise ValidationError("Invalid characters in text")
return True# BAD - No filtering
def speak(user_input: str):
    engine.synthesize(user_input)

# GOOD - Filter first
def speak(user_input: str):
    # NOTE(review): ContentFilter.filter returns (text, was_modified) —
    # presumably only the text element should be passed on; verify.
    filtered = content_filter.filter(user_input)
    engine.synthesize(filtered)

# BAD - Can generate very long audio
engine.synthesize(long_text)  # No limit

# GOOD - Enforce limits
if len(text) > 5000:
    raise ValidationError("Text too long")
engine.synthesize(text)

# pytest tests/test_tts_engine.py -v
# pytest --cov=jarvis.tts
# mypy src/jarvis/tts/
# python -m jarvis.tts --test