Skip to content

Codecs

Overview

VoIP ships two tiers of audio codecs:

| Extra required | Codecs available |
| --- | --- |
| audio (includes numpy) | PCMA (G.711 A-law), PCMU (G.711 µ-law) |
| hd-audio (includes numpy and pyav) | + G.722, Opus |

Install the minimal tier for NumPy-only telephony deployments (no PyAV required):

pip install voip[audio]

Install the full tier for wideband / Opus support via FFmpeg:

pip install voip[hd-audio]

SD audio

These codecs work without PyAV and require only numpy.

voip.codecs.pcma.PCMA

Bases: RTPCodec

G.711 A-law codec (RFC 3551 §4.5.14).

PCMA is the ITU-T G.711 A-law logarithmic companding codec for PSTN telephony, standardised in RFC 3551 with static payload type 8.

Both encode and decode interoperate bit-exactly with real RTP PCMA streams.

Source code in voip/codecs/pcma.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
class PCMA(RTPCodec):
    """ITU-T G.711 A-law codec for RTP ([RFC 3551 §4.5.14][]).

    A-law is the logarithmic companding scheme used for PSTN telephony.
    RFC 3551 assigns it the static payload type 8.

    The encoder and decoder are meant to interoperate bit-exactly with
    real RTP PCMA streams.

    [RFC 3551 §4.5.14]: https://datatracker.ietf.org/doc/html/rfc3551#section-4.5.14
    """

    payload_type: ClassVar[int] = 8
    encoding_name: ClassVar[str] = "pcma"
    sample_rate_hz: ClassVar[int] = 8000
    rtp_clock_rate_hz: ClassVar[int] = 8000
    frame_size: ClassVar[int] = 160
    timestamp_increment: ClassVar[int] = 160
    channels: ClassVar[int] = 1

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Expand A-law bytes to float32 PCM and resample to *output_rate_hz*.

        Args:
            payload: A-law encoded bytes, one byte per sample.
            output_rate_hz: Sample rate of the returned audio in Hz.
            input_rate_hz: Rate the payload was sampled at, or `None` to
                use `sample_rate_hz`.

        Returns:
            Float32 PCM produced by `resample`.
        """
        # Undo the 0x55 bit-inversion that G.711 applies before transmission.
        codes = np.bitwise_xor(np.frombuffer(payload, dtype=np.uint8), np.uint8(0x55))
        # Bit 7 set means a positive sample.
        polarity = np.where(
            (codes & 0x80) != 0, np.float32(1.0), np.float32(-1.0)
        )
        exponent = ((codes >> 4) & 0x07).astype(np.int32)
        # The 4 mantissa bits moved to the top of their slot (i.e. times 16).
        step = (codes & 0x0F).astype(np.int32) * 16
        # Segment 0 only adds the half-step (8); every other segment adds the
        # 0x108 bias and is then scaled by 2 ** (segment - 1).
        lowest_segment = step + 8
        upper_segments = (step + 0x108) << np.maximum(exponent - 1, 0)
        magnitude = np.where(exponent == 0, lowest_segment, upper_segments)
        magnitude = magnitude.astype(np.float32)
        pcm = (polarity * magnitude / 32768.0).astype(np.float32)
        if input_rate_hz is None:
            source_rate_hz = cls.sample_rate_hz
        else:
            source_rate_hz = input_rate_hz
        return cls.resample(pcm, source_rate_hz, output_rate_hz)

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        """Compress float PCM samples to A-law bytes.

        Args:
            samples: PCM samples; they are scaled by 32768 and clipped to
                the signed 16-bit range before companding.

        Returns:
            One A-law byte per input sample.
        """
        # Bring the input into 16-bit signed PCM.
        scaled = np.round(samples * 32768.0)
        pcm16 = np.clip(scaled, -32768, 32767).astype(np.int32)
        non_negative = pcm16 >= 0
        # Negative samples use -pcm - 1, which is the bitwise complement.
        magnitude = np.where(non_negative, pcm16, ~pcm16).astype(np.int32)
        # XOR mask: 0xD5 for positive samples, 0x55 for negative ones.
        toggle = np.where(non_negative, np.uint8(0xD5), np.uint8(0x55))
        # Segment index = number of upper bounds strictly below the magnitude
        # (side="left"), capped at the highest segment, 7.
        exceeded = np.searchsorted(_ALAW_SEG_UBOUND, magnitude, side="left")
        segment = np.minimum(exceeded.astype(np.int32), 7)
        # A-law quantises 13-bit PCM (16-bit >> 3): the mantissa shift is 4 for
        # segments 0 and 1 and segment + 3 above that, i.e. max(segment, 1) + 3.
        shift = (np.maximum(segment, 1) + 3).astype(np.int32)
        nibble = (magnitude >> shift) & 0x0F
        companded = ((segment << 4) | nibble).astype(np.uint8)
        return (companded ^ toggle).astype(np.uint8).tobytes()

voip.codecs.pcmu.PCMU

Bases: RTPCodec

G.711 mu-law codec (RFC 3551 §4.5.14).

PCMU is the ITU-T G.711 mu-law logarithmic companding codec for PSTN telephony, standardised in RFC 3551 with static payload type 0.

Both encode and decode are pure-NumPy and require no PyAV dependency.

Source code in voip/codecs/pcmu.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
class PCMU(RTPCodec):
    """G.711 mu-law codec ([RFC 3551 §4.5.14][]).

    PCMU is the ITU-T G.711 mu-law logarithmic companding codec for PSTN
    telephony, standardised in RFC 3551 with static payload type 0.

    Both encode and decode are pure-NumPy and require no PyAV dependency.

    Sign convention: on the wire, codes 0x80–0xFF are non-negative
    (0xFF is +0) and codes 0x00–0x7F are negative (0x7F is -0).  Because
    the wire byte is the bitwise complement of the sign/exponent/mantissa
    fields, a *set* sign bit after un-inverting marks a negative sample.

    [RFC 3551 §4.5.14]: https://datatracker.ietf.org/doc/html/rfc3551#section-4.5.14
    """

    payload_type: ClassVar[int] = 0
    encoding_name: ClassVar[str] = "pcmu"
    sample_rate_hz: ClassVar[int] = 8000
    rtp_clock_rate_hz: ClassVar[int] = 8000
    frame_size: ClassVar[int] = 160
    timestamp_increment: ClassVar[int] = 160
    channels: ClassVar[int] = 1

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Expand mu-law bytes to float32 PCM and resample to *output_rate_hz*.

        Args:
            payload: Mu-law encoded bytes, one byte per sample.
            output_rate_hz: Sample rate of the returned audio in Hz.
            input_rate_hz: Rate the payload was sampled at, or `None` to
                use `sample_rate_hz`.

        Returns:
            Float32 PCM produced by `resample`.
        """
        # Mu-law bytes are transmitted complemented; undo that first.
        raw = (~np.frombuffer(payload, dtype=np.uint8)).astype(np.uint8)
        # After un-inverting, a set sign bit means a NEGATIVE sample
        # (wire 0x00 is the most negative value, wire 0xFF is +0).
        sign = np.where(raw & 0x80, -1.0, 1.0)
        exp = ((raw >> 4) & 0x07).astype(np.int32)
        mantissa = (raw & 0x0F).astype(np.int32)
        # ITU-T G.711 §A: magnitude = (((mantissa << 3) + BIAS) << exp) - BIAS
        magnitude = (((mantissa << 3) + _MU_LAW_BIAS) << exp) - _MU_LAW_BIAS
        linear = (magnitude.astype(np.float32) / 32768.0).astype(np.float32)
        source_rate_hz = (
            input_rate_hz if input_rate_hz is not None else cls.sample_rate_hz
        )
        return cls.resample(
            (sign * linear).astype(np.float32), source_rate_hz, output_rate_hz
        )

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        """Compress float PCM samples to mu-law bytes.

        Args:
            samples: PCM samples; they are scaled by 32768 and clipped to
                the signed 16-bit range before companding.

        Returns:
            One mu-law byte per input sample (silence encodes as 0xFF).
        """
        pcm = np.clip(np.round(samples * 32768.0), -32768, 32767).astype(np.int32)
        # The pre-inversion sign bit is set for NEGATIVE samples, so that the
        # final complement yields wire codes 0x80–0xFF for non-negative input.
        sign = np.where(pcm < 0, 0x80, 0x00).astype(np.uint8)
        biased = np.minimum(np.abs(pcm) + _MU_LAW_BIAS, _MU_LAW_CLIP)
        # Exponent = position of the highest set bit of the biased magnitude,
        # relative to bit 7, limited to the 3-bit field.
        exp = np.clip(
            np.floor(np.log2(np.maximum(biased, 1))).astype(np.int32) - 7, 0, 7
        )
        mantissa = ((biased >> (exp + 3)) & 0x0F).astype(np.uint8)
        # Complement all bits for transmission.
        return (
            (~(sign | (exp.astype(np.uint8) << 4) | mantissa))
            .astype(np.uint8)
            .tobytes()
        )

HD audio

These codecs require the hd-audio extra (pip install voip[hd-audio]).

voip.codecs.g722.G722

Bases: PyAVCodec

G.722 wideband audio codec (RFC 3551 §4.5.2).

G.722 is an ITU-T ADPCM wideband codec. Despite encoding audio at 16 000 Hz, the RTP timestamp clock runs at 8 000 Hz per RFC 3551 — a well-known quirk of the original specification.

The entire buffer is encoded at once to preserve the ADPCM predictor state across packet boundaries.

Source code in voip/codecs/g722.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
class G722(PyAVCodec):
    """ITU-T G.722 wideband ADPCM codec ([RFC 3551 §4.5.2][]).

    The audio itself is sampled at 16 000 Hz, yet RFC 3551 fixes the RTP
    timestamp clock at 8 000 Hz — a long-standing quirk of the original
    specification.

    `packetize` encodes the complete buffer in a single pass so that the
    ADPCM predictor state is carried across packet boundaries.

    [RFC 3551 §4.5.2]: https://datatracker.ietf.org/doc/html/rfc3551#section-4.5.2
    """

    payload_type: ClassVar[int] = 9
    encoding_name: ClassVar[str] = "g722"
    #: Sample rate of the wideband audio itself, in Hz.
    sample_rate_hz: ClassVar[int] = 16000
    #: RTP timestamp clock per RFC 3551: 8 kHz although the audio is 16 kHz.
    rtp_clock_rate_hz: ClassVar[int] = 8000
    #: Number of 16 kHz audio samples in one 20 ms frame.
    frame_size: ClassVar[int] = 320
    #: Timestamp ticks per frame, counted on the 8 kHz RTP clock.
    timestamp_increment: ClassVar[int] = 160
    channels: ClassVar[int] = 1

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        # The caller's input_rate_hz is not forwarded; the RTP clock rate is
        # always passed instead.
        # NOTE(review): G722Decoder opens its context at sample_rate_hz
        # (16 kHz) while this path passes rtp_clock_rate_hz (8 kHz) — confirm
        # this is what decode_pcm expects.
        clock_rate_hz = cls.rtp_clock_rate_hz
        return cls.decode_pcm(
            payload, "g722", output_rate_hz, input_rate_hz=clock_rate_hz
        )

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        wideband_rate_hz = cls.sample_rate_hz
        return cls.encode_pcm(samples, "g722", wideband_rate_hz)

    @classmethod
    def packetize(cls, audio: np.ndarray) -> Iterator[bytes]:
        # Encode everything up front, then slice the byte stream per packet.
        bitstream = cls.encode(audio)
        # G.722 packs two samples per byte, so a 320-sample frame is 160 bytes.
        bytes_per_packet = cls.frame_size // 2
        for start in range(0, len(bitstream), bytes_per_packet):
            stop = start + bytes_per_packet
            yield bitstream[start:stop]

    @classmethod
    def create_decoder(
        cls, output_rate_hz: int, *, input_rate_hz: int | None = None
    ) -> G722Decoder:
        """Build a stateful G.722 decoder for a single call.

        The returned [G722Decoder][voip.codecs.g722.G722Decoder] keeps the
        ADPCM predictor state from one RTP packet to the next.  Hand it to
        [AudioCall][voip.audio.AudioCall] (via the `create_decoder`
        factory) so the state is not reset for every packet, which would
        otherwise produce robotic audio artefacts.

        *input_rate_hz* exists only so the signature matches
        [RTPCodec.create_decoder][voip.codecs.base.RTPCodec.create_decoder];
        it is not used because G.722 always decodes at 16 000 Hz internally.

        Args:
            output_rate_hz: Sample rate in Hz of the PCM the decoder emits.
            input_rate_hz: Ignored.  G.722 always decodes at `sample_rate_hz`.

        Returns:
            A fresh [G722Decoder][voip.codecs.g722.G722Decoder] instance.
        """
        decoder = G722Decoder(output_rate_hz)
        return decoder

create_decoder(output_rate_hz, *, input_rate_hz=None) classmethod

Create a stateful per-call G.722 decoder.

Returns a G722Decoder that preserves the ADPCM predictor state across consecutive RTP packets. Pass the returned decoder to AudioCall (via the create_decoder factory) to avoid the per-packet state reset that causes robotic audio artefacts.

The input_rate_hz parameter is accepted for API consistency with RTPCodec.create_decoder but is not used; G.722 always decodes at 16 000 Hz internally.

Parameters:

Name Type Description Default
output_rate_hz int

Target PCM sample rate in Hz for decoded audio.

required
input_rate_hz int | None

Ignored. G.722 always decodes at sample_rate_hz.

None

Returns:

Type Description
G722Decoder

A new G722Decoder instance.

Source code in voip/codecs/g722.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
@classmethod
def create_decoder(
    cls, output_rate_hz: int, *, input_rate_hz: int | None = None
) -> G722Decoder:
    """Build a stateful G.722 decoder for a single call.

    The returned [G722Decoder][voip.codecs.g722.G722Decoder] keeps the
    ADPCM predictor state from one RTP packet to the next.  Hand it to
    [AudioCall][voip.audio.AudioCall] (via the `create_decoder`
    factory) so the state is not reset for every packet, which would
    otherwise produce robotic audio artefacts.

    *input_rate_hz* exists only so the signature matches
    [RTPCodec.create_decoder][voip.codecs.base.RTPCodec.create_decoder];
    it is not used because G.722 always decodes at 16 000 Hz internally.

    Args:
        output_rate_hz: Sample rate in Hz of the PCM the decoder emits.
        input_rate_hz: Ignored.  G.722 always decodes at `sample_rate_hz`.

    Returns:
        A fresh [G722Decoder][voip.codecs.g722.G722Decoder] instance.
    """
    decoder = G722Decoder(output_rate_hz)
    return decoder

voip.codecs.g722.G722Decoder dataclass

Stateful G.722 decoder that preserves ADPCM predictor state across packets.

Creates a single persistent av.CodecContext for the life of the decoder and feeds each incoming RTP packet to the same context. This eliminates the per-packet predictor reset that causes robotic artefacts when decoding a G.722 stream with independent codec contexts.

Use G722.create_decoder rather than instantiating this class directly.

Attributes:

Name Type Description
output_rate_hz int

Target PCM sample rate in Hz for decoded audio.

codec_context AudioCodecContext

Persistent G.722 decoder context shared across all decode calls on this instance.

resampler AudioResampler

Resampler that converts decoded frames to output_rate_hz.

Source code in voip/codecs/g722.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
@dataclasses.dataclass(slots=True)
class G722Decoder:
    """G.722 decoder that keeps its ADPCM predictor state between packets.

    One
    [av.CodecContext](https://pyav.basswood-io.com/docs/stable/api/codec.html#av.codec.context.CodecContext)
    is opened when the decoder is constructed and every RTP payload is fed
    to that same context.  Decoding each packet with an independent context
    would reset the predictor per packet and cause robotic artefacts; a
    persistent context avoids that.

    Prefer [G722.create_decoder][voip.codecs.g722.G722.create_decoder] over
    constructing this class directly.

    Attributes:
        output_rate_hz: Sample rate in Hz of the PCM returned by
            [decode][voip.codecs.g722.G722Decoder.decode].
        codec_context: The persistent G.722 decoder context reused by every
            [decode][voip.codecs.g722.G722Decoder.decode] call on this
            instance.
        resampler: Resampler that converts decoded frames to
            `output_rate_hz`.
    """

    output_rate_hz: int
    codec_context: av.AudioCodecContext = dataclasses.field(init=False, repr=False)
    resampler: av.audio.resampler.AudioResampler = dataclasses.field(
        init=False, repr=False
    )

    def __post_init__(self) -> None:
        # Open a read-mode ("r") G.722 context at the codec's audio rate.
        context = av.CodecContext.create("g722", "r")
        self.codec_context = typing.cast(av.AudioCodecContext, context)
        self.codec_context.sample_rate = G722.sample_rate_hz
        self.codec_context.open()
        # Mono planar-float output at the requested rate.
        self.resampler = av.audio.resampler.AudioResampler(
            rate=self.output_rate_hz, layout="mono", format="fltp"
        )

    def decode(self, payload: bytes) -> np.ndarray:
        """Decode a single G.722 RTP payload using the persistent context.

        Args:
            payload: Raw G.722 RTP payload bytes (160 bytes per 20 ms frame).

        Returns:
            Float32 mono PCM array at `output_rate_hz` Hz; an empty float32
            array when the codec yields no frames for this payload.
        """
        chunks = []
        for frame in self.codec_context.decode(av.Packet(payload)):
            for resampled in self.resampler.resample(frame):
                chunks.append(resampled.to_ndarray().flatten())
        if not chunks:
            return np.array([], dtype=np.float32)
        return np.concatenate(chunks)

decode(payload)

Decode one G.722 RTP payload, preserving ADPCM state from prior packets.

Parameters:

Name Type Description Default
payload bytes

Raw G.722 RTP payload bytes (160 bytes per 20 ms frame).

required

Returns:

Type Description
ndarray

Float32 mono PCM array at output_rate_hz Hz.

Source code in voip/codecs/g722.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def decode(self, payload: bytes) -> np.ndarray:
    """Decode a single G.722 RTP payload using the persistent context.

    Args:
        payload: Raw G.722 RTP payload bytes (160 bytes per 20 ms frame).

    Returns:
        Float32 mono PCM array at `output_rate_hz` Hz; an empty float32
        array when the codec yields no frames for this payload.
    """
    chunks = []
    for frame in self.codec_context.decode(av.Packet(payload)):
        for resampled in self.resampler.resample(frame):
            chunks.append(resampled.to_ndarray().flatten())
    if not chunks:
        return np.array([], dtype=np.float32)
    return np.concatenate(chunks)

voip.codecs.opus.Opus

Bases: PyAVCodec

Opus audio codec (RFC 7587).

Opus is a highly flexible codec for interactive real-time speech and audio transmission. It uses dynamic payload type 111 and always operates at 48 000 Hz internally.

Incoming RTP payloads are wrapped in a minimal Ogg container before being passed to PyAV. Outbound PCM is encoded via libopus.

Source code in voip/codecs/opus.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
class Opus(PyAVCodec):
    """Opus audio codec ([RFC 7587][]).

    Opus is a highly flexible codec for interactive real-time speech and audio
    transmission. It uses dynamic payload type 111 and always operates at
    48 000 Hz internally.

    Incoming RTP payloads are wrapped in a minimal [Ogg][] container before
    being passed to PyAV. Outbound PCM is encoded via `libopus`.

    [RFC 7587]: https://datatracker.ietf.org/doc/html/rfc7587
    [Ogg]: https://wiki.xiph.org/Ogg
    """

    payload_type: ClassVar[int] = 111
    encoding_name: ClassVar[str] = "opus"
    sample_rate_hz: ClassVar[int] = 48000
    rtp_clock_rate_hz: ClassVar[int] = 48000
    frame_size: ClassVar[int] = 960
    timestamp_increment: ClassVar[int] = 960
    # NOTE(review): 2 here while the OpusHead built in _ogg_container declares
    # one channel — presumably because RFC 7587 requires SDP to advertise
    # "opus/48000/2" regardless of the actual channel count; confirm.
    channels: ClassVar[int] = 2

    @staticmethod
    def _ogg_crc32(data: bytes) -> int:
        """Compute an Ogg CRC32 checksum (polynomial 0x04C11DB7).

        The register starts at zero, bytes are processed most-significant
        bit first, and no final XOR is applied.

        Args:
            data: Bytes to checksum.

        Returns:
            32-bit CRC value.
        """
        crc = 0
        for byte in data:
            crc ^= byte << 24
            for _ in range(8):
                feedback = 0x04C11DB7 if crc & 0x80000000 else 0
                # Mask on every step so the register stays 32 bits wide;
                # without it the Python int grows by 8 bits per input byte
                # and the whole checksum becomes quadratic in len(data).
                crc = ((crc << 1) ^ feedback) & 0xFFFFFFFF
        return crc

    @classmethod
    def _ogg_page(
        cls,
        header_type: int,
        granule_position: int,
        serial_number: int,
        sequence_number: int,
        packets: list[bytes],
    ) -> bytes:
        """Build a single Ogg page ([RFC 3533](https://datatracker.ietf.org/doc/html/rfc3533)).

        Args:
            header_type: Page header type flags (e.g. `0x02` for BOS, `0x04` for EOS).
            granule_position: Granule position for this page.
            serial_number: Stream serial number.
            sequence_number: Page sequence number within the stream.
            packets: Packet byte strings to include in this page.

        Returns:
            Complete Ogg page bytes including CRC.

        Raises:
            struct.error: If the packets need more than 255 lacing values,
                which do not fit the one-byte segment count.
        """
        # Segment table: each packet is a run of 255-byte segments closed by
        # one segment shorter than 255 (0 when the length is a multiple of 255).
        lacing: list[int] = []
        for packet in packets:
            full_segments, tail = divmod(len(packet), 255)
            lacing.extend([255] * full_segments)
            lacing.append(tail)
        header = struct.pack(
            "<4sBBqIIIB",
            b"OggS",
            0,  # stream structure version
            header_type,
            granule_position,
            serial_number,
            sequence_number,
            0,  # CRC placeholder
            len(lacing),
        ) + bytes(lacing)
        page = header + b"".join(packets)
        # The checksum covers the whole page with the CRC field (bytes 22–25)
        # zeroed; splice the real value in afterwards.
        return page[:22] + struct.pack("<I", cls._ogg_crc32(page)) + page[26:]

    @classmethod
    def _ogg_container(cls, packet: bytes) -> bytes:
        """Wrap a raw Opus RTP payload in a minimal Ogg Opus container.

        Produces a three-page Ogg stream: BOS (OpusHead), comment
        (OpusTags), and the single data page.  Opus always uses 48 000 Hz
        internally ([RFC 7587 §4](https://datatracker.ietf.org/doc/html/rfc7587#section-4)).

        Args:
            packet: Raw Opus RTP payload bytes.

        Returns:
            Ogg Opus container bytes suitable for PyAV decoding.
        """
        # A fresh random serial number per container.
        serial_number = int.from_bytes(os.urandom(4), "little")
        vendor = b"voip"
        opus_head = struct.pack(
            "<8sBBHIhB",
            b"OpusHead",
            1,  # version
            1,  # channel count (mono)
            3840,  # pre-skip: 80 ms at 48 kHz (value recommended by RFC 7845)
            cls.sample_rate_hz,
            0,  # output gain
            0,  # channel mapping family (mono/stereo)
        )
        opus_tags = (
            struct.pack("<8sI", b"OpusTags", len(vendor))
            + vendor
            + struct.pack("<I", 0)  # zero user comments
        )
        return b"".join(
            [
                cls._ogg_page(0x02, 0, serial_number, 0, [opus_head]),  # BOS
                cls._ogg_page(0x00, 0, serial_number, 1, [opus_tags]),
                cls._ogg_page(0x04, 0, serial_number, 2, [packet]),  # EOS
            ]
        )

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        # input_rate_hz is accepted for signature compatibility only; it is
        # not forwarded to decode_pcm.
        return cls.decode_pcm(cls._ogg_container(payload), "ogg", output_rate_hz)

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        return cls.encode_pcm(samples, "libopus", cls.sample_rate_hz)

Registry

voip.codecs.get(encoding_name)

Get a codec class by its SDP encoding name.

Parameters:

Name Type Description Default
encoding_name str

SDP encoding name, case-insensitive (e.g. "opus", "G722", "PCMA").

required

Returns:

Type Description
type[RTPCodec]

Matching codec class.

Raises:

Type Description
NotImplementedError

When no registered codec matches encoding_name.

Source code in voip/codecs/__init__.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def get(encoding_name: str) -> type[RTPCodec]:
    """Get a codec class by its SDP encoding name.

    The name is lower-cased before it is looked up in `REGISTRY`.

    Args:
        encoding_name: SDP encoding name, case-insensitive
            (e.g. `"opus"`, `"G722"`, `"PCMA"`).

    Returns:
        Matching codec class.

    Raises:
        NotImplementedError: When no registered codec matches *encoding_name*.
    """
    try:
        return REGISTRY[encoding_name.lower()]
    except KeyError:
        # `from None`: the KeyError is an internal detail of the registry
        # lookup; suppress it so callers get one clean traceback instead of
        # "During handling of the above exception, another exception occurred".
        raise NotImplementedError(
            f"Unsupported codec: {encoding_name!r}. Supported: {list(REGISTRY)!r}"
        ) from None

Base classes

voip.codecs.base.RTPCodec

Base class for RTP audio codecs.

Concrete implementations: Opus, G722, PCMA, PCMU.

Codec classes are stateless; every method is a classmethod or staticmethod and codecs are referenced as type[RTPCodec], never instantiated. Per-call decoder state (required for ADPCM codecs such as G.722) is managed by PayloadDecoder instances returned by create_decoder.

Concrete subclasses define codec-specific class variables and override decode, encode, and optionally packetize.

Subclasses may use the shared PyAV-backed helpers or implement decode and encode using alternative backends such as NumPy.

Subclasses whose encoder carries state from one frame to the next (e.g. G.722 ADPCM) should override packetize to encode the whole buffer at once and preserve predictor state.

Subclasses that require PyAV additionally inherit from PyAVCodec.

Source code in voip/codecs/base.py
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class RTPCodec:
    """Common base for every RTP audio codec.

    Known implementations are [Opus][voip.codecs.Opus],
    [G722][voip.codecs.G722], [PCMA][voip.codecs.pcma.PCMA] and
    [PCMU][voip.codecs.pcmu.PCMU].

    A codec class holds no state: all of its methods are classmethods or
    staticmethods, and codecs are passed around as `type[RTPCodec]` rather
    than being instantiated.  State that must live for the duration of a
    call (needed by ADPCM codecs such as G.722) belongs to the
    [PayloadDecoder][voip.codecs.base.PayloadDecoder] objects handed out by
    [create_decoder][voip.codecs.base.RTPCodec.create_decoder].

    A concrete subclass sets the codec-specific class variables and
    overrides [decode][voip.codecs.base.RTPCodec.decode] and
    [encode][voip.codecs.base.RTPCodec.encode];
    [packetize][voip.codecs.base.RTPCodec.packetize] may be overridden too.

    [decode][voip.codecs.base.RTPCodec.decode] and
    [encode][voip.codecs.base.RTPCodec.encode] can be built on the shared
    PyAV-backed helpers or on another backend such as NumPy.

    Subclasses that produce variable-length output across frames (e.g. G.722
    ADPCM) should override `packetize` so the whole buffer is encoded at
    once and predictor state is preserved.

    Codecs that depend on [PyAV][] also inherit from
    [PyAVCodec][voip.codecs.av.PyAVCodec].

    [PyAV]: https://pyav.basswood-io.com/
    """

    payload_type: ClassVar[int]
    """Payload type number used in RTP."""

    encoding_name: ClassVar[str]
    """Lowercase encoding name as used in SDP."""

    sample_rate_hz: ClassVar[int]
    """Sample rate of the audio itself, in Hz."""

    rtp_clock_rate_hz: ClassVar[int]
    """Clock rate of the RTP timestamp in Hz (not always `sample_rate_hz`)."""

    frame_size: ClassVar[int]
    """Number of audio samples, at `sample_rate_hz`, in one 20 ms RTP frame."""

    timestamp_increment: ClassVar[int]
    """RTP timestamp ticks, at `rtp_clock_rate_hz`, covered by one frame."""

    channels: ClassVar[int]
    """Number of channels (1 = mono, 2 = stereo)."""

    @classmethod
    def resample(
        cls, audio: np.ndarray, source_rate_hz: int, destination_rate_hz: int
    ) -> np.ndarray:
        """Convert *audio* from *source_rate_hz* to *destination_rate_hz*.

        The conversion is a linear interpolation done with [numpy.interp][].

        Args:
            audio: Float32 mono PCM array.
            source_rate_hz: Sample rate of *audio* in Hz.
            destination_rate_hz: Target sample rate in Hz.

        Returns:
            A float32 array at *destination_rate_hz* Hz.  When the two rates
            are equal, *audio* itself is returned untouched.
        """
        if source_rate_hz == destination_rate_hz:
            return audio
        n_in = len(audio)
        if n_in == 0:
            return np.empty(0, dtype=np.float32)
        # Always emit at least one sample, however extreme the rate ratio.
        n_out = max(1, round(n_in * destination_rate_hz / source_rate_hz))
        query_positions = np.linspace(0, n_in - 1, n_out)
        known_positions = np.arange(n_in)
        interpolated = np.interp(query_positions, known_positions, audio)
        return interpolated.astype(np.float32)

    @classmethod
    def to_payload_format(cls) -> RTPPayloadFormat:
        """Describe this codec as an [RTPPayloadFormat][voip.sdp.types.RTPPayloadFormat] for SDP.

        The SDP sample rate is taken from `rtp_clock_rate_hz`, as RFC 3551
        requires (G.722, for example, advertises 8000 Hz in SDP although its
        audio actually runs at 16000 Hz).

        Returns:
            Payload format descriptor for this codec.
        """
        descriptor_fields = {
            "payload_type": cls.payload_type,
            "encoding_name": cls.encoding_name,
            "sample_rate": cls.rtp_clock_rate_hz,
            "channels": cls.channels,
        }
        return RTPPayloadFormat(**descriptor_fields)

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Turn one RTP payload into float32 mono PCM.

        The base implementation only raises; subclasses supply the
        codec-specific decoding.

        Args:
            payload: Raw RTP payload bytes.
            output_rate_hz: Target sample rate in Hz.
            input_rate_hz: Input clock rate override in Hz, or `None` to use
                the subclass default.

        Returns:
            Float32 mono PCM array at *output_rate_hz* Hz.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        message = f"{cls.__name__} does not implement decode."
        raise NotImplementedError(message)

    @classmethod
    def create_decoder(
        cls, output_rate_hz: int, *, input_rate_hz: int | None = None
    ) -> PayloadDecoder:
        """Build the per-call payload decoder for this codec (stateless here).

        Codecs whose decoding needs state carried from one RTP packet to the
        next override this (e.g. G.722 ADPCM — see
        [G722.create_decoder][voip.codecs.g722.G722.create_decoder]).

        Args:
            output_rate_hz: Target PCM sample rate in Hz for decoded audio.
            input_rate_hz: Input clock rate override, or `None` to use the
                codec default.

        Returns:
            A [PayloadDecoder][voip.codecs.base.PayloadDecoder]; the default
            is a [PerPacketDecoder][voip.codecs.base.PerPacketDecoder]
            constructed with this codec class and the two rates.
        """
        decoder = PerPacketDecoder(cls, output_rate_hz, input_rate_hz)
        return decoder

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        """Turn float32 mono PCM into one RTP payload.

        The base implementation only raises; subclasses supply the
        codec-specific encoding.

        Args:
            samples: Float32 mono PCM at `sample_rate_hz` Hz.

        Returns:
            Encoded bytes for one RTP payload.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        message = f"{cls.__name__} does not implement encode."
        raise NotImplementedError(message)

    @classmethod
    def packetize(cls, audio: np.ndarray) -> Iterator[bytes]:
        """Split *audio* into 20 ms frames and yield one encoded payload each.

        By default every `frame_size` slice is passed to `encode` on its
        own.  Codecs that must encode the entire buffer in one go to keep
        their state (e.g. G.722) override this method.

        Args:
            audio: Float32 mono PCM at `sample_rate_hz` Hz.

        Yields:
            Encoded payload bytes, one per RTP packet.
        """
        samples_per_frame = cls.frame_size
        for begin in range(0, len(audio), samples_per_frame):
            end = begin + samples_per_frame
            yield cls.encode(audio[begin:end])

channels class-attribute

Channel count (1 = mono, 2 = stereo).

encoding_name class-attribute

SDP encoding name (lowercase).

frame_size class-attribute

Audio samples per 20 ms RTP frame at sample_rate_hz.

payload_type class-attribute

RTP payload type number.

rtp_clock_rate_hz class-attribute

RTP timestamp clock rate in Hz (may differ from sample_rate_hz).

sample_rate_hz class-attribute

Actual audio sample rate in Hz.

timestamp_increment class-attribute

RTP timestamp ticks per frame at rtp_clock_rate_hz.

create_decoder(output_rate_hz, *, input_rate_hz=None) classmethod

Create a stateless per-call payload decoder for this codec.

Override in subclasses that require stateful decoding across RTP packets (e.g. G.722 ADPCM — see G722.create_decoder).

Parameters:

Name Type Description Default
output_rate_hz int

Target PCM sample rate in Hz for decoded audio.

required
input_rate_hz int | None

Input clock rate override, or None to use the codec default.

None

Returns:

Type Description
PayloadDecoder

A PayloadDecoder that, by default, is a PerPacketDecoder delegating each call to decode.

Source code in voip/codecs/base.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
@classmethod
def create_decoder(
    cls, output_rate_hz: int, *, input_rate_hz: int | None = None
) -> PayloadDecoder:
    """Build the per-call payload decoder for this codec (stateless here).

    Codecs whose decoding needs state carried from one RTP packet to the
    next override this (e.g. G.722 ADPCM — see
    [G722.create_decoder][voip.codecs.g722.G722.create_decoder]).

    Args:
        output_rate_hz: Target PCM sample rate in Hz for decoded audio.
        input_rate_hz: Input clock rate override, or `None` to use the
            codec default.

    Returns:
        A [PayloadDecoder][voip.codecs.base.PayloadDecoder]; the default
        is a [PerPacketDecoder][voip.codecs.base.PerPacketDecoder]
        constructed with this codec class and the two rates.
    """
    decoder = PerPacketDecoder(cls, output_rate_hz, input_rate_hz)
    return decoder

decode(payload, output_rate_hz, *, input_rate_hz=None) classmethod

Decode an RTP payload to float32 mono PCM.

Override in subclasses to implement codec-specific decoding.

Parameters:

Name Type Description Default
payload bytes

Raw RTP payload bytes.

required
output_rate_hz int

Target sample rate in Hz.

required
input_rate_hz int | None

Input clock rate override in Hz, or None to use the subclass default.

None

Returns:

Type Description
ndarray

Float32 mono PCM array at output_rate_hz Hz.

Source code in voip/codecs/base.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
@classmethod
def decode(
    cls,
    payload: bytes,
    output_rate_hz: int,
    *,
    input_rate_hz: int | None = None,
) -> np.ndarray:
    """Decode an RTP payload to float32 mono PCM.

    Override in subclasses to implement codec-specific decoding.

    Args:
        payload: Raw RTP payload bytes.
        output_rate_hz: Target sample rate in Hz.
        input_rate_hz: Input clock rate override in Hz, or `None` to use
            the subclass default.

    Returns:
        Float32 mono PCM array at *output_rate_hz* Hz.
    """
    raise NotImplementedError(f"{cls.__name__} does not implement decode.")

encode(samples) classmethod

Encode float32 mono PCM to an RTP payload.

Override in subclasses to implement codec-specific encoding.

Parameters:

Name Type Description Default
samples ndarray

Float32 mono PCM at sample_rate_hz Hz.

required

Returns:

Type Description
bytes

Encoded bytes for one RTP payload.

Source code in voip/codecs/base.py
195
196
197
198
199
200
201
202
203
204
205
206
207
@classmethod
def encode(cls, samples: np.ndarray) -> bytes:
    """Turn float32 mono PCM into the bytes of one RTP payload.

    This base version is only a placeholder: it always raises, and concrete
    codecs are expected to override it with their own encoding logic.

    Args:
        samples: Float32 mono PCM at `sample_rate_hz` Hz.

    Returns:
        Encoded bytes for one RTP payload.

    Raises:
        NotImplementedError: Always, unless a subclass overrides this method.
    """
    message = f"{cls.__name__} does not implement encode."
    raise NotImplementedError(message)

packetize(audio) classmethod

Encode audio and yield one encoded payload per 20 ms RTP frame.

The default implementation encodes one frame_size chunk at a time using encode. Override in subclasses (e.g. G.722) where the entire buffer must be encoded at once to preserve codec state.

Parameters:

Name Type Description Default
audio ndarray

Float32 mono PCM at sample_rate_hz Hz.

required

Yields:

Type Description
bytes

Encoded payload bytes, one per RTP packet.

Source code in voip/codecs/base.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
@classmethod
def packetize(cls, audio: np.ndarray) -> Iterator[bytes]:
    """Split *audio* into frames and lazily yield one encoded payload each.

    Every payload is produced by calling `encode` on a consecutive slice
    of `frame_size` samples (one 20 ms RTP frame). When `len(audio)` is
    not a multiple of `frame_size`, the final slice is shorter than a full
    frame. Subclasses whose encoder has to see the whole buffer at once to
    preserve codec state (e.g. G.722) should override this.

    Args:
        audio: Float32 mono PCM at `sample_rate_hz` Hz.

    Yields:
        Encoded payload bytes, one per RTP packet.
    """
    frame_starts = range(0, len(audio), cls.frame_size)
    yield from (
        cls.encode(audio[start : start + cls.frame_size])
        for start in frame_starts
    )

resample(audio, source_rate_hz, destination_rate_hz) classmethod

Resample audio from source_rate_hz to destination_rate_hz.

Uses linear interpolation via numpy.interp.

Parameters:

Name Type Description Default
audio ndarray

Float32 mono PCM array.

required
source_rate_hz int

Sample rate of audio in Hz.

required
destination_rate_hz int

Target sample rate in Hz.

required

Returns:

Type Description
ndarray

Resampled float32 array at destination_rate_hz Hz, or audio unchanged when both rates are equal.

Source code in voip/codecs/base.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
@classmethod
def resample(
    cls, audio: np.ndarray, source_rate_hz: int, destination_rate_hz: int
) -> np.ndarray:
    """Convert *audio* from *source_rate_hz* to *destination_rate_hz*.

    The output is computed by linear interpolation via [numpy.interp][],
    sampling the input at evenly spaced positions from its first to its
    last sample. The output length is the input length scaled by the rate
    ratio, rounded, and never less than one sample for non-empty input.

    Args:
        audio: Float32 mono PCM array.
        source_rate_hz: Sample rate of *audio* in Hz.
        destination_rate_hz: Target sample rate in Hz.

    Returns:
        Resampled float32 array at *destination_rate_hz* Hz, or *audio*
        unchanged (the same object) when both rates are equal.
    """
    # Same rate: hand back the caller's array untouched, no copy or cast.
    if source_rate_hz == destination_rate_hz:
        return audio

    n_in = len(audio)
    if n_in == 0:
        return np.empty(0, dtype=np.float32)

    n_out = max(1, round(n_in * destination_rate_hz / source_rate_hz))
    # Fractional input indices at which each output sample is evaluated.
    sample_positions = np.linspace(0, n_in - 1, n_out)
    interpolated = np.interp(sample_positions, np.arange(n_in), audio)
    return interpolated.astype(np.float32)

to_payload_format() classmethod

Create an RTPPayloadFormat for SDP negotiation.

Uses rtp_clock_rate_hz as the SDP sample rate, which is correct per RFC 3551 (e.g. G.722 advertises 8000 Hz in SDP even though the actual audio runs at 16000 Hz).

Returns:

Type Description
RTPPayloadFormat

Payload format descriptor for this codec.

Source code in voip/codecs/base.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
@classmethod
def to_payload_format(cls) -> RTPPayloadFormat:
    """Describe this codec as an [RTPPayloadFormat][voip.sdp.types.RTPPayloadFormat] for SDP negotiation.

    The advertised sample rate is `rtp_clock_rate_hz`, not
    `sample_rate_hz`. That is the correct value per RFC 3551 (e.g. G.722
    advertises 8000 Hz in SDP even though the actual audio runs at
    16000 Hz).

    Returns:
        Payload format descriptor for this codec.
    """
    descriptor_fields = {
        "payload_type": cls.payload_type,
        "encoding_name": cls.encoding_name,
        # SDP carries the RTP clock rate rather than the audio sample rate.
        "sample_rate": cls.rtp_clock_rate_hz,
        "channels": cls.channels,
    }
    return RTPPayloadFormat(**descriptor_fields)

voip.codecs.base.PayloadDecoder

Bases: Protocol

Protocol for per-call RTP payload decoders.

Implementations decode raw RTP payload bytes to float32 mono PCM. Stateful implementations (e.g. G722Decoder) preserve codec predictor state across successive decode calls within a single call session.

Source code in voip/codecs/base.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
class PayloadDecoder(Protocol):
    """Protocol for per-call RTP payload decoders.

    The interface consists of a single method,
    [decode][voip.codecs.base.PayloadDecoder.decode], which turns raw RTP
    payload bytes into float32 mono PCM.

    Stateful implementations (e.g. [G722Decoder][voip.codecs.g722.G722Decoder])
    preserve codec predictor state across successive
    [decode][voip.codecs.base.PayloadDecoder.decode] calls within a single
    call session.
    """

    def decode(self, payload: bytes) -> np.ndarray:
        """Decode one RTP payload to float32 mono PCM.

        Args:
            payload: Raw RTP payload bytes for a single packet.

        Returns:
            Float32 mono PCM array.
        """
        # Interface stub: the body is intentionally empty.
        ...

decode(payload)

Decode one RTP payload to float32 mono PCM.

Parameters:

Name Type Description Default
payload bytes

Raw RTP payload bytes for a single packet.

required

Returns:

Type Description
ndarray

Float32 mono PCM array.

Source code in voip/codecs/base.py
38
39
40
41
42
43
44
45
46
47
def decode(self, payload: bytes) -> np.ndarray:
    """Decode one RTP payload to float32 mono PCM.

    This is the interface method of the payload-decoder protocol; it
    declares the signature only.

    Args:
        payload: Raw RTP payload bytes for a single packet.

    Returns:
        Float32 mono PCM array.
    """
    # Interface stub: the body is intentionally empty.
    ...

voip.codecs.base.PerPacketDecoder dataclass

Stateless payload decoder that processes each RTP packet independently.

Delegates each call to RTPCodec.decode, decoding each payload independently without preserving cross-packet state. Suitable for stateless codecs such as PCMA, PCMU, and Opus.

Attributes:

Name Type Description
codec type[RTPCodec]

Codec class to delegate decoding to.

output_rate_hz int

Target PCM sample rate in Hz.

input_rate_hz int | None

Input clock rate override, or None to use the codec default.

Source code in voip/codecs/base.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
@dataclasses.dataclass(frozen=True)
class PerPacketDecoder:
    """Payload decoder that treats every RTP packet in isolation.

    Each call is forwarded to
    [RTPCodec.decode][voip.codecs.base.RTPCodec.decode] on the configured
    codec class, so nothing is remembered from one packet to the next.
    Suitable for stateless codecs such as PCMA, PCMU, and Opus.

    Attributes:
        codec: Codec class to delegate decoding to.
        output_rate_hz: Target PCM sample rate in Hz.
        input_rate_hz: Input clock rate override, or `None` to use the codec default.
    """

    codec: type[RTPCodec]
    output_rate_hz: int
    input_rate_hz: int | None = None

    def decode(self, payload: bytes) -> np.ndarray:
        """Decode a single RTP payload with the configured codec and rates.

        Args:
            payload: Raw RTP payload bytes.

        Returns:
            Float32 mono PCM array at `output_rate_hz` Hz.
        """
        decode_payload = self.codec.decode
        return decode_payload(
            payload,
            self.output_rate_hz,
            input_rate_hz=self.input_rate_hz,
        )

decode(payload)

Decode one RTP payload to float32 PCM.

Parameters:

Name Type Description Default
payload bytes

Raw RTP payload bytes.

required

Returns:

Type Description
ndarray

Float32 mono PCM array at output_rate_hz Hz.

Source code in voip/codecs/base.py
246
247
248
249
250
251
252
253
254
255
256
257
def decode(self, payload: bytes) -> np.ndarray:
    """Decode a single RTP payload with the configured codec and rates.

    Args:
        payload: Raw RTP payload bytes.

    Returns:
        Float32 mono PCM array at `output_rate_hz` Hz.
    """
    decode_payload = self.codec.decode
    return decode_payload(
        payload,
        self.output_rate_hz,
        input_rate_hz=self.input_rate_hz,
    )

voip.codecs.av.PyAVCodec

Bases: RTPCodec

RTP codec that decodes and encodes audio via PyAV.

Concrete implementations: Opus, G722.

Source code in voip/codecs/av.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
class PyAVCodec(RTPCodec):
    """Base for RTP codecs whose encoding and decoding is done by [PyAV][].

    Provides the shared `decode_pcm` / `encode_pcm` helpers. Concrete
    implementations: [Opus][voip.codecs.Opus], [G722][voip.codecs.G722].

    [PyAV]: https://pyav.basswood-io.com/
    """

    @classmethod
    def decode_pcm(
        cls,
        data: bytes,
        av_format: str,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Decode raw audio bytes with PyAV and return float32 mono PCM.

        The bytes are opened as an in-memory container, audio stream 0 is
        decoded, and every frame is converted to planar float mono at
        *output_rate_hz*.

        Args:
            data: Raw audio bytes in the codec's wire format.
            av_format: PyAV format string (e.g. `"ogg"`, `"alaw"`).
            output_rate_hz: Target sample rate in Hz.
            input_rate_hz: Input clock rate hint for the PyAV decoder, or
                `None` for self-describing formats like Ogg.

        Returns:
            Float32 mono PCM array at *output_rate_hz* Hz; an empty float32
            array when nothing was decoded.
        """
        # The sample-rate option is only passed when the caller supplied one.
        open_options: dict[str, str] = {}
        if input_rate_hz is not None:
            open_options["sample_rate"] = str(input_rate_hz)

        to_mono_float = av.audio.resampler.AudioResampler(
            format="fltp", layout="mono", rate=output_rate_hz
        )
        chunks: list[np.ndarray] = []
        with av.open(
            io.BytesIO(data),
            mode="r",
            format=av_format,
            options=open_options,
        ) as container:
            for decoded in container.decode(audio=0):
                chunks.extend(
                    converted.to_ndarray().flatten()
                    for converted in to_mono_float.resample(decoded)
                )
        # Final pass with None collects whatever the resampler still holds.
        chunks.extend(
            converted.to_ndarray().flatten()
            for converted in to_mono_float.resample(None)
        )
        if not chunks:
            return np.array([], dtype=np.float32)
        return np.concatenate(chunks)

    @classmethod
    def encode_pcm(
        cls,
        samples: np.ndarray,
        av_codec_name: str,
        sample_rate_hz: int,
    ) -> bytes:
        """Encode float32 mono PCM with PyAV and return the raw codec bytes.

        The samples are converted to signed 16-bit integers, wrapped in a
        single mono audio frame, and pushed through a freshly created
        encoder, which is then flushed.

        Args:
            samples: Float32 mono PCM array in the range `[-1, 1]`.
            av_codec_name: PyAV codec name (e.g. `"g722"` or `"libopus"`).
            sample_rate_hz: Sample rate of *samples* in Hz.

        Returns:
            Encoded audio bytes: all emitted packets concatenated in order.
        """
        encoder = cast(
            av.AudioCodecContext, av.CodecContext.create(av_codec_name, "w")
        )
        encoder.sample_rate = sample_rate_hz
        encoder.format = av.AudioFormat("s16")
        encoder.layout = av.AudioLayout("mono")
        encoder.open()

        # Scale to the int16 range; clipping keeps +1.0 from overflowing.
        scaled = np.round(samples * 32768.0)
        int16_pcm = np.clip(scaled, -32768, 32767).astype(np.int16)

        frame = av.AudioFrame.from_ndarray(
            int16_pcm[np.newaxis, :], format="s16", layout="mono"
        )
        frame.sample_rate = sample_rate_hz
        frame.pts = 0

        frame_packets = encoder.encode(frame)
        # Encoding None drains the packets the encoder is still holding.
        flushed_packets = encoder.encode(None)
        return b"".join(
            bytes(packet) for packet in [*frame_packets, *flushed_packets]
        )

decode_pcm(data, av_format, output_rate_hz, *, input_rate_hz=None) classmethod

Decode raw audio bytes via PyAV into float32 mono PCM.

Parameters:

Name Type Description Default
data bytes

Raw audio bytes in the codec's wire format.

required
av_format str

PyAV format string (e.g. "ogg", "alaw").

required
output_rate_hz int

Target sample rate in Hz.

required
input_rate_hz int | None

Input clock rate hint for the PyAV decoder, or None for self-describing formats like Ogg.

None

Returns:

Type Description
ndarray

Float32 mono PCM array at output_rate_hz Hz.

Source code in voip/codecs/av.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
@classmethod
def decode_pcm(
    cls,
    data: bytes,
    av_format: str,
    output_rate_hz: int,
    *,
    input_rate_hz: int | None = None,
) -> np.ndarray:
    """Decode raw audio bytes with PyAV and return float32 mono PCM.

    The bytes are opened as an in-memory container, audio stream 0 is
    decoded, and every frame is converted to planar float mono at
    *output_rate_hz*.

    Args:
        data: Raw audio bytes in the codec's wire format.
        av_format: PyAV format string (e.g. `"ogg"`, `"alaw"`).
        output_rate_hz: Target sample rate in Hz.
        input_rate_hz: Input clock rate hint for the PyAV decoder, or
            `None` for self-describing formats like Ogg.

    Returns:
        Float32 mono PCM array at *output_rate_hz* Hz; an empty float32
        array when nothing was decoded.
    """
    # The sample-rate option is only passed when the caller supplied one.
    open_options: dict[str, str] = {}
    if input_rate_hz is not None:
        open_options["sample_rate"] = str(input_rate_hz)

    to_mono_float = av.audio.resampler.AudioResampler(
        format="fltp", layout="mono", rate=output_rate_hz
    )
    chunks: list[np.ndarray] = []
    with av.open(
        io.BytesIO(data),
        mode="r",
        format=av_format,
        options=open_options,
    ) as container:
        for decoded in container.decode(audio=0):
            chunks.extend(
                converted.to_ndarray().flatten()
                for converted in to_mono_float.resample(decoded)
            )
    # Final pass with None collects whatever the resampler still holds.
    chunks.extend(
        converted.to_ndarray().flatten()
        for converted in to_mono_float.resample(None)
    )
    if not chunks:
        return np.array([], dtype=np.float32)
    return np.concatenate(chunks)

encode_pcm(samples, av_codec_name, sample_rate_hz) classmethod

Encode float32 mono PCM to raw codec bytes via PyAV.

Parameters:

Name Type Description Default
samples ndarray

Float32 mono PCM array in the range [-1, 1].

required
av_codec_name str

PyAV codec name (e.g. "g722" or "libopus").

required
sample_rate_hz int

Sample rate of samples in Hz.

required

Returns:

Type Description
bytes

Encoded audio bytes.

Source code in voip/codecs/av.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
@classmethod
def encode_pcm(
    cls,
    samples: np.ndarray,
    av_codec_name: str,
    sample_rate_hz: int,
) -> bytes:
    """Encode float32 mono PCM with PyAV and return the raw codec bytes.

    The samples are converted to signed 16-bit integers, wrapped in a
    single mono audio frame, and pushed through a freshly created encoder,
    which is then flushed.

    Args:
        samples: Float32 mono PCM array in the range `[-1, 1]`.
        av_codec_name: PyAV codec name (e.g. `"g722"` or `"libopus"`).
        sample_rate_hz: Sample rate of *samples* in Hz.

    Returns:
        Encoded audio bytes: all emitted packets concatenated in order.
    """
    encoder = cast(
        av.AudioCodecContext, av.CodecContext.create(av_codec_name, "w")
    )
    encoder.sample_rate = sample_rate_hz
    encoder.format = av.AudioFormat("s16")
    encoder.layout = av.AudioLayout("mono")
    encoder.open()

    # Scale to the int16 range; clipping keeps +1.0 from overflowing.
    scaled = np.round(samples * 32768.0)
    int16_pcm = np.clip(scaled, -32768, 32767).astype(np.int16)

    frame = av.AudioFrame.from_ndarray(
        int16_pcm[np.newaxis, :], format="s16", layout="mono"
    )
    frame.sample_rate = sample_rate_hz
    frame.pts = 0

    frame_packets = encoder.encode(frame)
    # Encoding None drains the packets the encoder is still holding.
    flushed_packets = encoder.encode(None)
    return b"".join(
        bytes(packet) for packet in [*frame_packets, *flushed_packets]
    )