Transcription reference

audiotext.transcriber

TranscriptionOptions dataclass

Source code in src/audiotext/transcriber.py
@dataclass(frozen=True)
class TranscriptionOptions:
    """Immutable set of decoding/VAD knobs forwarded to the transcription backend.

    Frozen so a single instance can be shared safely across transcribe() calls.
    Defaults are the values shown on each field below.
    """

    # Target language, or "auto" to let the backend detect it (see FasterWhisperTranscriber.transcribe).
    language: Language = "auto"
    # Enable voice-activity-detection filtering; controls whether the vad_* fields below are used.
    vad_filter: bool = True
    # Beam-search width; must be >= 1 (enforced in FasterWhisperTranscriber.transcribe).
    beam_size: int = 5
    # Number of sampled candidates to pick from.
    best_of: int = 5
    # Temperature(s) passed to the backend; presumably a fallback schedule — TODO confirm backend semantics.
    temperature: Temperature = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
    # Optional text prompt to bias decoding; None disables it.
    initial_prompt: str | None = None
    # Suppress segment timestamps; also disables word timestamps downstream.
    without_timestamps: bool = False
    # Feed previously decoded text back as context for later segments.
    condition_on_previous_text: bool = True
    # Request per-word timestamps (ignored when without_timestamps is True).
    word_timestamps: bool = False
    # Cap on generated tokens; None means backend default.
    max_new_tokens: int | None = None
    # Audio chunk length; None means backend default.
    chunk_length: int | None = None
    # Number of segments sampled for language detection.
    language_detection_segments: int = 1
    # Confidence threshold for accepting a detected language.
    language_detection_threshold: float = 0.5
    # Probability above which a segment is treated as non-speech.
    no_speech_threshold: float = 0.6
    # VAD: minimum silence (ms) that splits speech regions.
    vad_min_silence_duration_ms: int = 2000
    # VAD: padding (ms) added around detected speech.
    vad_speech_pad_ms: int = 400
    # Penalty applied to repeated tokens; 1.0 disables it.
    repetition_penalty: float = 1.0
    # Forbid repeating n-grams of this size; 0 disables it.
    no_repeat_ngram_size: int = 0
    # Optional hotword string passed through to the backend.
    hotwords: str | None = None

language class-attribute instance-attribute

language = 'auto'

vad_filter class-attribute instance-attribute

vad_filter = True

beam_size class-attribute instance-attribute

beam_size = 5

best_of class-attribute instance-attribute

best_of = 5

temperature class-attribute instance-attribute

temperature = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)

initial_prompt class-attribute instance-attribute

initial_prompt = None

without_timestamps class-attribute instance-attribute

without_timestamps = False

condition_on_previous_text class-attribute instance-attribute

condition_on_previous_text = True

word_timestamps class-attribute instance-attribute

word_timestamps = False

max_new_tokens class-attribute instance-attribute

max_new_tokens = None

chunk_length class-attribute instance-attribute

chunk_length = None

language_detection_segments class-attribute instance-attribute

language_detection_segments = 1

language_detection_threshold class-attribute instance-attribute

language_detection_threshold = 0.5

no_speech_threshold class-attribute instance-attribute

no_speech_threshold = 0.6

vad_min_silence_duration_ms class-attribute instance-attribute

vad_min_silence_duration_ms = 2000

vad_speech_pad_ms class-attribute instance-attribute

vad_speech_pad_ms = 400

repetition_penalty class-attribute instance-attribute

repetition_penalty = 1.0

no_repeat_ngram_size class-attribute instance-attribute

no_repeat_ngram_size = 0

hotwords class-attribute instance-attribute

hotwords = None

__init__

__init__(
    language="auto",
    vad_filter=True,
    beam_size=5,
    best_of=5,
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    initial_prompt=None,
    without_timestamps=False,
    condition_on_previous_text=True,
    word_timestamps=False,
    max_new_tokens=None,
    chunk_length=None,
    language_detection_segments=1,
    language_detection_threshold=0.5,
    no_speech_threshold=0.6,
    vad_min_silence_duration_ms=2000,
    vad_speech_pad_ms=400,
    repetition_penalty=1.0,
    no_repeat_ngram_size=0,
    hotwords=None,
)

Transcriber

Bases: Protocol

Source code in src/audiotext/transcriber.py
class Transcriber(Protocol):
    """Structural (duck-typed) interface that transcription backends satisfy."""

    # Name/identifier of the loaded model.
    model_name: str
    # Device the backend runs on (e.g. "cpu").
    device: str

    def transcribe(
        self,
        audio_path: str | PathLike[str],
        options: TranscriptionOptions | None = None,
    ) -> TranscriptionResult:
        """Transcribe the audio file at *audio_path*; None options means defaults."""
        ...

model_name instance-attribute

model_name

device instance-attribute

device

transcribe

transcribe(audio_path, options=None)
Source code in src/audiotext/transcriber.py
def transcribe(
    self,
    audio_path: str | PathLike[str],
    options: TranscriptionOptions | None = None,
) -> TranscriptionResult:
    """Transcribe *audio_path* into a TranscriptionResult.

    A ``None`` *options* means the implementation's defaults apply.
    """
    ...

FasterWhisperTranscriber

Thin wrapper around faster-whisper with project-safe defaults.

Source code in src/audiotext/transcriber.py
class FasterWhisperTranscriber:
    """Thin wrapper around faster-whisper with project-safe defaults."""

    def __init__(
        self,
        model: str,
        *,
        device: str = "cpu",
        compute_type: str = "int8",
        cpu_threads: int = 0,
        num_workers: int = 1,
    ) -> None:
        """Load a faster-whisper model.

        Args:
            model: Model name/path passed to ``WhisperModel``.
            device: Device string (default "cpu").
            compute_type: Quantization/compute mode (default "int8").
            cpu_threads: CPU thread count; 0 lets the backend decide.
            num_workers: Parallel worker count for the backend.

        Raises:
            RuntimeError: If faster-whisper is not installed. The import is
                deferred to here so the rest of the module works without it.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise RuntimeError(
                "faster-whisper is not installed. Install with: "
                'uv pip install -e ".[faster-whisper]"'
            ) from exc

        self.model_name = model
        self.device = device
        self.compute_type = compute_type
        self._model = WhisperModel(
            model,
            device=device,
            compute_type=compute_type,
            cpu_threads=cpu_threads,
            num_workers=num_workers,
        )

    @classmethod
    def from_preset(cls, preset_name: str = "cpu-lite", **overrides: object) -> "FasterWhisperTranscriber":
        """Build a transcriber from the named preset; *overrides* win over preset fields."""
        preset = get_preset(preset_name)
        return cls.from_model_preset(preset, **overrides)

    @classmethod
    def from_model_preset(cls, preset: ModelPreset, **overrides: object) -> "FasterWhisperTranscriber":
        """Build a transcriber from *preset*, letting keyword *overrides* replace its fields."""
        config = {
            "model": preset.model,
            "device": preset.device,
            "compute_type": preset.compute_type,
        }
        config.update(overrides)
        return cls(**config)

    def transcribe(
        self,
        audio_path: str | PathLike[str],
        options: TranscriptionOptions | None = None,
    ) -> TranscriptionResult:
        """Transcribe *audio_path* eagerly and return the full result.

        Args:
            audio_path: Path to the audio file.
            options: Decoding options; ``None`` uses ``TranscriptionOptions()``.

        Returns:
            TranscriptionResult with joined text, detected language, and all segments.

        Raises:
            ValueError: If ``beam_size`` or ``best_of`` is < 1.
        """
        options = options or TranscriptionOptions()
        if options.beam_size < 1:
            raise ValueError("beam_size must be >= 1")
        # Validate best_of too, mirroring normalize_options(); previously an
        # invalid value was passed straight through to the backend.
        if options.best_of < 1:
            raise ValueError("best_of must be >= 1")

        # "auto" is our sentinel; the backend expects None for auto-detection.
        language = None if options.language == "auto" else options.language
        vad_parameters = None
        if options.vad_filter:
            vad_parameters = {
                "min_silence_duration_ms": options.vad_min_silence_duration_ms,
                "speech_pad_ms": options.vad_speech_pad_ms,
            }
        segments_iter, info = self._model.transcribe(
            str(audio_path),
            language=language,
            vad_filter=options.vad_filter,
            vad_parameters=vad_parameters,
            beam_size=options.beam_size,
            best_of=options.best_of,
            temperature=options.temperature,
            initial_prompt=options.initial_prompt,
            without_timestamps=options.without_timestamps,
            condition_on_previous_text=options.condition_on_previous_text,
            # Word timestamps are meaningless when segment timestamps are suppressed.
            word_timestamps=options.word_timestamps and not options.without_timestamps,
            max_new_tokens=options.max_new_tokens,
            chunk_length=options.chunk_length,
            language_detection_segments=options.language_detection_segments,
            language_detection_threshold=options.language_detection_threshold,
            no_speech_threshold=options.no_speech_threshold,
            repetition_penalty=options.repetition_penalty,
            no_repeat_ngram_size=options.no_repeat_ngram_size,
            hotwords=options.hotwords,
        )
        # Drain the backend's lazy generator so callers get a stable, reusable result.
        segments = tuple(
            Segment(start=segment.start, end=segment.end, text=segment.text)
            for segment in segments_iter
        )
        text = "".join(segment.text for segment in segments).strip()
        return TranscriptionResult(
            text=text,
            language=info.language,
            language_probability=info.language_probability,
            # Older backend versions may not expose duration on the info object.
            duration=getattr(info, "duration", None),
            segments=segments,
        )

model_name instance-attribute

model_name = model

device instance-attribute

device = device

compute_type instance-attribute

compute_type = compute_type

__init__

__init__(
    model,
    *,
    device="cpu",
    compute_type="int8",
    cpu_threads=0,
    num_workers=1,
)
Source code in src/audiotext/transcriber.py
def __init__(
    self,
    model: str,
    *,
    device: str = "cpu",
    compute_type: str = "int8",
    cpu_threads: int = 0,
    num_workers: int = 1,
) -> None:
    """Load a faster-whisper model.

    Args:
        model: Model name/path passed to ``WhisperModel``.
        device: Device string (default "cpu").
        compute_type: Quantization/compute mode (default "int8").
        cpu_threads: CPU thread count; 0 lets the backend decide.
        num_workers: Parallel worker count for the backend.

    Raises:
        RuntimeError: If faster-whisper is not installed. The import is
            deferred to here so the rest of the module imports without it.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        raise RuntimeError(
            "faster-whisper is not installed. Install with: "
            'uv pip install -e ".[faster-whisper]"'
        ) from exc

    self.model_name = model
    self.device = device
    self.compute_type = compute_type
    self._model = WhisperModel(
        model,
        device=device,
        compute_type=compute_type,
        cpu_threads=cpu_threads,
        num_workers=num_workers,
    )

from_preset classmethod

from_preset(preset_name='cpu-lite', **overrides)
Source code in src/audiotext/transcriber.py
@classmethod
def from_preset(cls, preset_name: str = "cpu-lite", **overrides: object) -> "FasterWhisperTranscriber":
    """Look up the named preset and build a transcriber from it; *overrides* take precedence."""
    return cls.from_model_preset(get_preset(preset_name), **overrides)

from_model_preset classmethod

from_model_preset(preset, **overrides)
Source code in src/audiotext/transcriber.py
@classmethod
def from_model_preset(cls, preset: ModelPreset, **overrides: object) -> "FasterWhisperTranscriber":
    """Build a transcriber from *preset*'s fields, with keyword *overrides* winning on conflict."""
    settings: dict[str, object] = {
        "model": preset.model,
        "device": preset.device,
        "compute_type": preset.compute_type,
        **overrides,
    }
    return cls(**settings)

transcribe

transcribe(audio_path, options=None)
Source code in src/audiotext/transcriber.py
def transcribe(
    self,
    audio_path: str | PathLike[str],
    options: TranscriptionOptions | None = None,
) -> TranscriptionResult:
    """Transcribe *audio_path* eagerly and return the full result.

    Args:
        audio_path: Path to the audio file.
        options: Decoding options; ``None`` uses ``TranscriptionOptions()``.

    Returns:
        TranscriptionResult with joined text, detected language, and all segments.

    Raises:
        ValueError: If ``beam_size`` or ``best_of`` is < 1.
    """
    options = options or TranscriptionOptions()
    if options.beam_size < 1:
        raise ValueError("beam_size must be >= 1")
    # Validate best_of too, mirroring normalize_options(); previously an
    # invalid value was passed straight through to the backend.
    if options.best_of < 1:
        raise ValueError("best_of must be >= 1")

    # "auto" is our sentinel; the backend expects None for auto-detection.
    language = None if options.language == "auto" else options.language
    vad_parameters = None
    if options.vad_filter:
        vad_parameters = {
            "min_silence_duration_ms": options.vad_min_silence_duration_ms,
            "speech_pad_ms": options.vad_speech_pad_ms,
        }
    segments_iter, info = self._model.transcribe(
        str(audio_path),
        language=language,
        vad_filter=options.vad_filter,
        vad_parameters=vad_parameters,
        beam_size=options.beam_size,
        best_of=options.best_of,
        temperature=options.temperature,
        initial_prompt=options.initial_prompt,
        without_timestamps=options.without_timestamps,
        condition_on_previous_text=options.condition_on_previous_text,
        # Word timestamps are meaningless when segment timestamps are suppressed.
        word_timestamps=options.word_timestamps and not options.without_timestamps,
        max_new_tokens=options.max_new_tokens,
        chunk_length=options.chunk_length,
        language_detection_segments=options.language_detection_segments,
        language_detection_threshold=options.language_detection_threshold,
        no_speech_threshold=options.no_speech_threshold,
        repetition_penalty=options.repetition_penalty,
        no_repeat_ngram_size=options.no_repeat_ngram_size,
        hotwords=options.hotwords,
    )
    # Drain the backend's lazy generator so callers get a stable, reusable result.
    segments = tuple(
        Segment(start=segment.start, end=segment.end, text=segment.text)
        for segment in segments_iter
    )
    text = "".join(segment.text for segment in segments).strip()
    return TranscriptionResult(
        text=text,
        language=info.language,
        language_probability=info.language_probability,
        # Older backend versions may not expose duration on the info object.
        duration=getattr(info, "duration", None),
        segments=segments,
    )

backend_features

backend_features(backend)
Source code in src/audiotext/transcriber.py
def backend_features(backend: str) -> dict[str, bool]:
    """Return the capability matrix for *backend*.

    Raises:
        ValueError: If *backend* is not a known backend name.
    """
    if backend != "faster-whisper":
        raise ValueError(f"Unsupported backend: {backend}")
    # Capabilities the faster-whisper backend actually provides.
    supported = {"transcription", "language_detection", "timestamps", "word_timestamps"}
    all_features = (
        "transcription",
        "translation",
        "language_detection",
        "timestamps",
        "word_timestamps",
        "streaming",
        "diarization",
    )
    return {feature: feature in supported for feature in all_features}

normalize_options

normalize_options(*, defaults=None, overrides=None)
Source code in src/audiotext/transcriber.py
def normalize_options(
    *,
    defaults: Mapping[str, object] | None = None,
    overrides: Mapping[str, object | None] | None = None,
) -> TranscriptionOptions:
    """Merge fast defaults, *defaults*, and non-None *overrides* into validated options.

    Precedence (lowest to highest): FAST_TRANSCRIPTION_DEFAULTS, *defaults*,
    *overrides*. Override entries that are None are ignored.

    Raises:
        ValueError: If the language is not one of auto/en/es/ca, or if
            beam_size or best_of is < 1.
    """
    merged: dict[str, object] = dict(FAST_TRANSCRIPTION_DEFAULTS)
    if defaults:
        merged.update(defaults)
    for key, value in dict(overrides or {}).items():
        # None means "no override" rather than an explicit value.
        if value is not None:
            merged[key] = value

    language = str(merged.get("language", "auto"))
    if language not in {"auto", "en", "es", "ca"}:
        raise ValueError("language must be auto, en, es, or ca")
    beam_size = int(merged.get("beam_size", 1))
    best_of = int(merged.get("best_of", 1))
    if beam_size < 1:
        raise ValueError("beam_size must be >= 1")
    if best_of < 1:
        raise ValueError("best_of must be >= 1")

    prompt = merged.get("initial_prompt")
    return TranscriptionOptions(
        language=language,  # type: ignore[arg-type]
        beam_size=beam_size,
        best_of=best_of,
        vad_filter=bool(merged.get("vad_filter", True)),
        temperature=merged.get("temperature", 0.0),  # type: ignore[arg-type]
        # Anything other than a str (including None) falls back to no prompt.
        initial_prompt=prompt if isinstance(prompt, str) else None,
        without_timestamps=bool(merged.get("without_timestamps", False)),
        condition_on_previous_text=bool(merged.get("condition_on_previous_text", False)),
        word_timestamps=bool(merged.get("word_timestamps", False)),
        no_speech_threshold=float(merged.get("no_speech_threshold", 0.6)),
        vad_min_silence_duration_ms=int(merged.get("vad_min_silence_duration_ms", 2000)),
        vad_speech_pad_ms=int(merged.get("vad_speech_pad_ms", 400)),
    )

create_transcriber_from_preset

create_transcriber_from_preset(preset, **overrides)
Source code in src/audiotext/transcriber.py
def create_transcriber_from_preset(preset: ModelPreset, **overrides: object) -> Transcriber:
    """Instantiate the transcriber matching *preset*'s backend, forwarding *overrides*.

    Raises:
        ValueError: If the preset names an unknown backend.
    """
    if preset.backend != "faster-whisper":
        raise ValueError(f"Unsupported backend: {preset.backend}")
    return FasterWhisperTranscriber.from_model_preset(preset, **overrides)