Skip to content

Codecs

Overview

VoIP ships two tiers of audio codecs:

| Extra required | Codecs available |
| --- | --- |
| audio (includes numpy) | PCMA (G.711 A-law), PCMU (G.711 µ-law) |
| hd-audio (includes numpy and pyav) | + G.722, Opus |

Install the minimal tier for pure-Python telephony deployments:

pip install voip[audio]

Install the full tier for wideband / Opus support via FFmpeg:

pip install voip[hd-audio]

SD audio

These codecs work without PyAV and require only numpy.

voip.codecs.pcma.PCMA

Bases: RTPCodec

G.711 A-law codec (RFC 3551 §4.5.14).

PCMA is the ITU-T G.711 A-law logarithmic companding codec for PSTN telephony, standardised in RFC 3551 with static payload type 8.

Both encode and decode interoperate bit-exactly with real RTP PCMA streams.

Source code in voip/codecs/pcma.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
class PCMA(RTPCodec):
    """G.711 A-law codec ([RFC 3551 §4.5.14][]).

    A-law is the ITU-T G.711 logarithmic companding scheme used for PSTN
    telephony; RFC 3551 assigns it the static RTP payload type 8.

    Encoding and decoding interoperate bit-exactly with real RTP PCMA
    streams.

    [RFC 3551 §4.5.14]: https://datatracker.ietf.org/doc/html/rfc3551#section-4.5.14
    """

    payload_type: ClassVar[int] = 8
    encoding_name: ClassVar[str] = "pcma"
    sample_rate_hz: ClassVar[int] = 8000
    rtp_clock_rate_hz: ClassVar[int] = 8000
    frame_size: ClassVar[int] = 160
    timestamp_increment: ClassVar[int] = 160
    channels: ClassVar[int] = 1

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Expand A-law bytes to float32 PCM and resample to *output_rate_hz*.

        Args:
            payload: A-law encoded bytes, one byte per sample.
            output_rate_hz: Sample rate of the returned audio in Hz.
            input_rate_hz: Sample rate of *payload* in Hz; `None` selects
                `sample_rate_hz`.

        Returns:
            Float32 PCM array at *output_rate_hz* Hz.
        """
        # A-law code words are XORed with 0x55 on the wire; undo that first.
        codes = np.frombuffer(payload, dtype=np.uint8) ^ np.uint8(0x55)
        # Bit 7 set means a positive sample.
        polarity = np.where(codes & 0x80, 1.0, -1.0).astype(np.float32)
        seg = ((codes & 0x70) >> 4).astype(np.int32)
        # The 4 mantissa bits, moved up by one nibble (i.e. multiplied by 16).
        base = (codes & 0x0F).astype(np.int32) << 4
        # Segment 0 is linear: only the half-step offset (8) is added.
        linear_low = base + 8
        # Higher segments add the 0x108 bias and scale by 2 ** (segment - 1).
        linear_high = np.left_shift(base + 0x108, np.maximum(seg - 1, 0))
        magnitude = np.where(seg == 0, linear_low, linear_high).astype(np.float32)
        pcm = (polarity * magnitude / 32768.0).astype(np.float32)
        rate_hz = cls.sample_rate_hz if input_rate_hz is None else input_rate_hz
        return cls.resample(pcm, rate_hz, output_rate_hz)

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        """Compress float PCM samples to A-law bytes.

        Args:
            samples: Float PCM array; values are scaled by 32768 and clipped
                to the signed 16-bit range before companding.

        Returns:
            One A-law byte per input sample.
        """
        # Bring the samples into the signed 16-bit integer range.
        scaled = np.round(samples * 32768.0)
        pcm = np.clip(scaled, -32768, 32767).astype(np.int32)
        non_negative = pcm >= 0
        # Negative samples use the one's-complement magnitude (-pcm - 1) and
        # the 0x55 mask; non-negative samples keep their value and use 0xD5.
        magnitude = np.where(non_negative, pcm, -pcm - 1).astype(np.int32)
        mask = np.where(non_negative, np.uint8(0xD5), np.uint8(0x55))
        # side="left" yields the number of upper bounds strictly below the
        # magnitude, which is the segment index; cap it at the last segment.
        seg_index = np.searchsorted(_ALAW_SEG_UBOUND, magnitude, side="left")
        seg = np.minimum(seg_index.astype(np.int32), 7)
        # A-law quantises 13-bit PCM (16-bit >> 3), so in 16-bit space the
        # mantissa sits 4 bits up for segments 0-1 and (seg + 3) bits up above.
        shift = np.where(seg < 2, 4, seg + 3).astype(np.int32)
        mantissa = (np.right_shift(magnitude, shift) & 0x0F).astype(np.uint8)
        aval = (seg.astype(np.uint8) << 4) | mantissa
        return (aval ^ mask).astype(np.uint8).tobytes()

voip.codecs.pcmu.PCMU

Bases: RTPCodec

G.711 mu-law codec (RFC 3551 §4.5.14).

PCMU is the ITU-T G.711 mu-law logarithmic companding codec for PSTN telephony, standardised in RFC 3551 with static payload type 0.

Both encode and decode are pure-NumPy and require no PyAV dependency.

Source code in voip/codecs/pcmu.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class PCMU(RTPCodec):
    """G.711 mu-law codec ([RFC 3551 §4.5.14][]).

    PCMU is the ITU-T G.711 mu-law logarithmic companding codec for PSTN
    telephony, standardised in RFC 3551 with static payload type 0.

    Both encode and decode are pure-NumPy and require no PyAV dependency.

    Sign convention: mu-law bytes are transmitted bit-inverted, so after
    complementing a wire byte a *set* bit 7 denotes a negative sample.
    Wire byte `0xFF` is therefore positive zero and `0x80` is positive
    full scale.

    [RFC 3551 §4.5.14]: https://datatracker.ietf.org/doc/html/rfc3551#section-4.5.14
    """

    payload_type: ClassVar[int] = 0
    encoding_name: ClassVar[str] = "pcmu"
    sample_rate_hz: ClassVar[int] = 8000
    rtp_clock_rate_hz: ClassVar[int] = 8000
    frame_size: ClassVar[int] = 160
    timestamp_increment: ClassVar[int] = 160
    channels: ClassVar[int] = 1

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Expand mu-law bytes to float32 PCM and resample to *output_rate_hz*.

        Args:
            payload: Mu-law encoded bytes, one byte per sample.
            output_rate_hz: Sample rate of the returned audio in Hz.
            input_rate_hz: Sample rate of *payload* in Hz; `None` selects
                `sample_rate_hz`.

        Returns:
            Float32 PCM array at *output_rate_hz* Hz.
        """
        # Complement the wire byte to recover the logical code word.
        raw = (~np.frombuffer(payload, dtype=np.uint8)).astype(np.uint8)
        # In the complemented code word a set sign bit marks a NEGATIVE
        # sample (the opposite of A-law), so wire 0xFF decodes to +0.
        sign = np.where(raw & 0x80, -1.0, 1.0)
        exp = ((raw >> 4) & 0x07).astype(np.int32)
        mantissa = (raw & 0x0F).astype(np.int32)
        # ITU-T G.711 §A: magnitude = (((mantissa << 3) + BIAS) << exp) - BIAS
        magnitude = (((mantissa << 3) + _MU_LAW_BIAS) << exp) - _MU_LAW_BIAS
        linear = (magnitude.astype(np.float32) / 32768.0).astype(np.float32)
        source_rate_hz = (
            input_rate_hz if input_rate_hz is not None else cls.sample_rate_hz
        )
        return cls.resample(
            (sign * linear).astype(np.float32), source_rate_hz, output_rate_hz
        )

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        """Compress float PCM samples to mu-law bytes.

        Args:
            samples: Float PCM array; values are scaled by 32768 and clipped
                to the signed 16-bit range before companding.

        Returns:
            One mu-law byte per input sample.
        """
        pcm = np.clip(np.round(samples * 32768.0), -32768, 32767).astype(np.int32)
        # Sign bit of the logical code word: set for negative samples.  The
        # final complement flips it, giving 0xFF for +0 on the wire.
        sign = np.where(pcm < 0, 0x80, 0x00).astype(np.uint8)
        # Add the companding bias and clamp so the exponent stays in range.
        biased = np.minimum(np.abs(pcm) + _MU_LAW_BIAS, _MU_LAW_CLIP)
        # Exponent = position of the highest set bit relative to bit 7.
        exp = np.clip(
            np.floor(np.log2(np.maximum(biased, 1))).astype(np.int32) - 7, 0, 7
        )
        mantissa = ((biased >> (exp + 3)) & 0x0F).astype(np.uint8)
        return (
            (~(sign | (exp.astype(np.uint8) << 4) | mantissa))
            .astype(np.uint8)
            .tobytes()
        )

HD audio

These codecs require PyAV, which is installed by the hd-audio extra (pip install voip[hd-audio]).

voip.codecs.g722.G722

Bases: PyAVCodec

G.722 wideband audio codec (RFC 3551 §4.5.2).

G.722 is an ITU-T ADPCM wideband codec. Despite encoding audio at 16 000 Hz, the RTP timestamp clock runs at 8 000 Hz per RFC 3551 — a well-known quirk of the original specification.

The entire buffer is encoded at once to preserve the ADPCM predictor state across packet boundaries.

Source code in voip/codecs/g722.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
class G722(PyAVCodec):
    """G.722 wideband audio codec ([RFC 3551 §4.5.2][]).

    G.722 is an ITU-T ADPCM wideband codec.  The audio itself is sampled at
    16 000 Hz, yet RFC 3551 fixes the RTP timestamp clock at 8 000 Hz — a
    well-known quirk of the original specification.

    Packetisation encodes the whole buffer in one pass so that the ADPCM
    predictor state carries over packet boundaries.

    [RFC 3551 §4.5.2]: https://datatracker.ietf.org/doc/html/rfc3551#section-4.5.2
    """

    payload_type: ClassVar[int] = 9
    encoding_name: ClassVar[str] = "g722"
    #: Actual wideband audio sample rate in Hz.
    sample_rate_hz: ClassVar[int] = 16000
    #: RTP timestamp clock rate per RFC 3551 (8 kHz despite 16 kHz audio).
    rtp_clock_rate_hz: ClassVar[int] = 8000
    #: Audio samples per 20 ms frame at 16 kHz.
    frame_size: ClassVar[int] = 320
    #: RTP timestamp ticks per frame at the 8 kHz clock.
    timestamp_increment: ClassVar[int] = 160
    channels: ClassVar[int] = 1

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Decode a G.722 payload to PCM at *output_rate_hz* via PyAV.

        Args:
            payload: Raw G.722 payload bytes.
            output_rate_hz: Sample rate of the returned audio in Hz.
            input_rate_hz: Accepted for interface compatibility but not
                consulted; `rtp_clock_rate_hz` is always forwarded.

        Returns:
            Whatever `decode_pcm` returns for the `"g722"` format.
        """
        clock_rate_hz = cls.rtp_clock_rate_hz
        return cls.decode_pcm(
            payload, "g722", output_rate_hz, input_rate_hz=clock_rate_hz
        )

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        """Encode PCM sampled at `sample_rate_hz` to G.722 bytes via PyAV."""
        return cls.encode_pcm(samples, "g722", cls.sample_rate_hz)

    @classmethod
    def packetize(cls, audio: np.ndarray) -> Iterator[bytes]:
        """Encode *audio* in one pass and yield it in per-frame payloads.

        Args:
            audio: PCM samples at `sample_rate_hz` Hz.

        Yields:
            Consecutive slices of the encoded stream, `frame_size // 2`
            bytes each (the last one may be shorter).
        """
        bitstream = cls.encode(audio)
        # Two samples per encoded byte: 320 samples per frame -> 160 bytes.
        step = cls.frame_size // 2
        yield from (
            bitstream[start : start + step]
            for start in range(0, len(bitstream), step)
        )

voip.codecs.opus.Opus

Bases: PyAVCodec

Opus audio codec (RFC 7587).

Opus is a highly flexible codec for interactive real-time speech and audio transmission. It uses dynamic payload type 111 and always operates at 48 000 Hz internally.

Incoming RTP payloads are wrapped in a minimal Ogg container before being passed to PyAV. Outbound PCM is encoded via libopus.

Source code in voip/codecs/opus.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
class Opus(PyAVCodec):
    """Opus audio codec ([RFC 7587][]).

    Opus is a highly flexible codec for interactive real-time speech and audio
    transmission. It uses dynamic payload type 111 and always operates at
    48 000 Hz internally.

    Incoming RTP payloads are wrapped in a minimal [Ogg][] container before
    being passed to PyAV. Outbound PCM is encoded via `libopus`.

    [RFC 7587]: https://datatracker.ietf.org/doc/html/rfc7587
    [Ogg]: https://wiki.xiph.org/Ogg
    """

    payload_type: ClassVar[int] = 111
    encoding_name: ClassVar[str] = "opus"
    sample_rate_hz: ClassVar[int] = 48000
    rtp_clock_rate_hz: ClassVar[int] = 48000
    frame_size: ClassVar[int] = 960
    timestamp_increment: ClassVar[int] = 960
    # NOTE(review): the Ogg header built below declares one channel while this
    # says two — presumably this is the SDP rtpmap channel count that RFC 7587
    # fixes at 2; confirm against the SDP code.
    channels: ClassVar[int] = 2

    @staticmethod
    def _ogg_crc32(data: bytes) -> int:
        """Compute an Ogg CRC32 checksum (polynomial 0x04C11DB7).

        The CRC is the MSB-first variant with a zero initial value and no
        final XOR.

        Args:
            data: Bytes to checksum.

        Returns:
            32-bit CRC value.
        """
        crc = 0
        for byte in data:
            crc ^= byte << 24
            for _ in range(8):
                crc <<= 1
                # The bit shifted out of the 32-bit register lands in bit 32.
                # XORing with 0x1_04C11DB7 applies the polynomial and clears
                # that bit, so the register never grows beyond 32 bits.
                if crc & 0x100000000:
                    crc ^= 0x104C11DB7
        return crc

    @classmethod
    def _ogg_page(
        cls,
        header_type: int,
        granule_position: int,
        serial_number: int,
        sequence_number: int,
        packets: list[bytes],
    ) -> bytes:
        """Build a single Ogg page ([RFC 3533](https://datatracker.ietf.org/doc/html/rfc3533)).

        Args:
            header_type: Page header type flags (e.g. `0x02` for BOS, `0x04` for EOS).
            granule_position: Granule position for this page.
            serial_number: Stream serial number.
            sequence_number: Page sequence number within the stream.
            packets: Packet byte strings to include in this page.

        Returns:
            Complete Ogg page bytes including CRC.

        Raises:
            struct.error: If the packets need more than 255 lacing values,
                which do not fit the one-byte segment count.
        """
        # Segment table: each packet is described by a run of 255s followed
        # by one terminating value below 255 (0 when the length is a multiple
        # of 255).
        lacing: list[int] = []
        for packet in packets:
            remaining = len(packet)
            while remaining >= 255:
                lacing.append(255)
                remaining -= 255
            lacing.append(remaining)
        header = struct.pack(
            "<4sBBqIIIB",
            b"OggS",
            0,  # stream structure version
            header_type,
            granule_position,
            serial_number,
            sequence_number,
            0,  # CRC placeholder
            len(lacing),
        ) + bytes(lacing)
        page = header + b"".join(packets)
        # The checksum covers the whole page with the CRC field (bytes 22-25)
        # zeroed; splice the real value into that slot afterwards.
        return page[:22] + struct.pack("<I", cls._ogg_crc32(page)) + page[26:]

    @classmethod
    def _ogg_container(cls, packet: bytes) -> bytes:
        """Wrap a raw Opus RTP payload in a minimal Ogg Opus container.

        Produces a three-page Ogg stream: BOS (OpusHead), comment
        (OpusTags), and the single data page.  Opus always uses 48 000 Hz
        internally ([RFC 7587 §4](https://datatracker.ietf.org/doc/html/rfc7587#section-4)).

        Args:
            packet: Raw Opus RTP payload bytes.

        Returns:
            Ogg Opus container bytes suitable for PyAV decoding.
        """
        # A fresh random serial number identifies this one-off logical stream.
        serial_number = int.from_bytes(os.urandom(4), "little")
        vendor = b"voip"
        opus_head = struct.pack(
            "<8sBBHIhB",
            b"OpusHead",
            1,  # version
            1,  # channel count (mono)
            3840,  # pre-skip: 80 ms at 48 kHz (value recommended by RFC 7845 §5.1)
            cls.sample_rate_hz,
            0,  # output gain
            0,  # channel mapping family (mono/stereo)
        )
        opus_tags = (
            struct.pack("<8sI", b"OpusTags", len(vendor))
            + vendor
            + struct.pack("<I", 0)  # zero user comments
        )
        return b"".join(
            [
                cls._ogg_page(0x02, 0, serial_number, 0, [opus_head]),  # BOS
                cls._ogg_page(0x00, 0, serial_number, 1, [opus_tags]),
                cls._ogg_page(0x04, 0, serial_number, 2, [packet]),  # EOS
            ]
        )

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Decode one Opus RTP payload to PCM at *output_rate_hz*.

        Args:
            payload: Raw Opus RTP payload bytes.
            output_rate_hz: Sample rate of the returned audio in Hz.
            input_rate_hz: Accepted for interface compatibility but not
                consulted; the Ogg wrapper carries the stream parameters.

        Returns:
            Whatever `decode_pcm` returns for the wrapped `"ogg"` stream.
        """
        return cls.decode_pcm(cls._ogg_container(payload), "ogg", output_rate_hz)

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        """Encode PCM sampled at `sample_rate_hz` to Opus via `libopus`."""
        return cls.encode_pcm(samples, "libopus", cls.sample_rate_hz)

Registry

voip.codecs.get(encoding_name)

Get a codec class by its SDP encoding name.

Parameters:

Name Type Description Default
encoding_name str

SDP encoding name, case-insensitive (e.g. "opus", "G722", "PCMA").

required

Returns:

Type Description
type[RTPCodec]

Matching codec class.

Raises:

Type Description
NotImplementedError

When no registered codec matches encoding_name.

Source code in voip/codecs/__init__.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def get(encoding_name: str) -> type[RTPCodec]:
    """Get a codec class by its SDP encoding name.

    Args:
        encoding_name: SDP encoding name, case-insensitive
            (e.g. `"opus"`, `"G722"`, `"PCMA"`).

    Returns:
        Matching codec class.

    Raises:
        NotImplementedError: When no registered codec matches *encoding_name*.
    """
    try:
        # Registry keys are compared in lowercase.
        return REGISTRY[encoding_name.lower()]
    except KeyError:
        # The KeyError is an implementation detail of the lookup; suppress it
        # so the traceback shows only the error callers are expected to handle.
        raise NotImplementedError(
            f"Unsupported codec: {encoding_name!r}. Supported: {list(REGISTRY)!r}"
        ) from None

Base classes

voip.codecs.base.RTPCodec

Base class for RTP audio codecs.

Concrete implementations: Opus, G722, PCMA, PCMU.

All codec implementations are stateless: every method is a classmethod or staticmethod and codecs are referenced as type[RTPCodec], never instantiated.

Concrete subclasses define codec-specific class variables and override decode, encode, and optionally packetize.

Subclasses may use the shared PyAV-backed helpers or implement decode and encode using alternative backends such as NumPy.

Subclasses whose encoder carries state from one frame to the next (e.g. G.722 ADPCM) should override packetize to encode the whole buffer at once and preserve predictor state.

Subclasses that require PyAV additionally inherit from PyAVCodec.

Source code in voip/codecs/base.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
class RTPCodec:
    """Base class for RTP audio codecs.

    Concrete implementations: [`Opus`][voip.codecs.Opus],
    [`G722`][voip.codecs.G722], [`PCMA`][voip.codecs.pcma.PCMA],
    [`PCMU`][voip.codecs.pcmu.PCMU].

    Codecs are stateless: every method is a classmethod or staticmethod, and
    a codec is always handled as `type[RTPCodec]` rather than as an instance.

    A concrete subclass fills in the codec-specific class variables and
    overrides [`decode`][voip.codecs.base.RTPCodec.decode],
    [`encode`][voip.codecs.base.RTPCodec.encode], and optionally
    [`packetize`][voip.codecs.base.RTPCodec.packetize].

    [`decode`][voip.codecs.base.RTPCodec.decode] and
    [`encode`][voip.codecs.base.RTPCodec.encode] may be built on the shared
    PyAV-backed helpers or on another backend such as NumPy.

    Subclasses that produce variable-length output across frames (e.g. G.722
    ADPCM) should override `packetize` so the whole buffer is encoded at once
    and predictor state is preserved.

    Subclasses that need [PyAV][] additionally inherit from
    [`PyAVCodec`][voip.codecs.av.PyAVCodec].

    [PyAV]: https://pyav.basswood-io.com/
    """

    payload_type: ClassVar[int]
    """RTP payload type number."""

    encoding_name: ClassVar[str]
    """SDP encoding name (lowercase)."""

    sample_rate_hz: ClassVar[int]
    """Actual audio sample rate in Hz."""

    rtp_clock_rate_hz: ClassVar[int]
    """RTP timestamp clock rate in Hz (may differ from `sample_rate_hz`)."""

    frame_size: ClassVar[int]
    """Audio samples per 20 ms RTP frame at `sample_rate_hz`."""

    timestamp_increment: ClassVar[int]
    """RTP timestamp ticks per frame at `rtp_clock_rate_hz`."""

    channels: ClassVar[int]
    """Channel count (1 = mono, 2 = stereo)."""

    @classmethod
    def resample(
        cls, audio: np.ndarray, source_rate_hz: int, destination_rate_hz: int
    ) -> np.ndarray:
        """Convert *audio* from *source_rate_hz* to *destination_rate_hz*.

        Samples are linearly interpolated with [`numpy.interp`][] over an
        evenly spaced grid that spans the first to the last input sample.

        Args:
            audio: Float32 mono PCM array.
            source_rate_hz: Sample rate of *audio* in Hz.
            destination_rate_hz: Target sample rate in Hz.

        Returns:
            *audio* itself when the two rates match; otherwise a new float32
            array at *destination_rate_hz* Hz (empty for empty input).
        """
        if source_rate_hz == destination_rate_hz:
            return audio
        n_in = len(audio)
        if n_in == 0:
            return np.empty(0, dtype=np.float32)
        # Always produce at least one output sample for non-empty input.
        n_out = max(1, round(n_in * destination_rate_hz / source_rate_hz))
        positions = np.linspace(0, n_in - 1, n_out)
        interpolated = np.interp(positions, np.arange(n_in), audio)
        return interpolated.astype(np.float32)

    @classmethod
    def to_payload_format(cls) -> RTPPayloadFormat:
        """Build the [`RTPPayloadFormat`][voip.sdp.types.RTPPayloadFormat] used in SDP negotiation.

        The SDP sample rate is taken from `rtp_clock_rate_hz`, as RFC 3551
        requires (e.g. G.722 is advertised at 8000 Hz although its audio
        runs at 16000 Hz).

        Returns:
            Payload format descriptor for this codec.
        """
        descriptor = RTPPayloadFormat(
            payload_type=cls.payload_type,
            encoding_name=cls.encoding_name,
            sample_rate=cls.rtp_clock_rate_hz,
            channels=cls.channels,
        )
        return descriptor

    @classmethod
    def decode(
        cls,
        payload: bytes,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Decode an RTP payload to float32 mono PCM.

        The base implementation only raises; subclasses supply the
        codec-specific decoding.

        Args:
            payload: Raw RTP payload bytes.
            output_rate_hz: Target sample rate in Hz.
            input_rate_hz: Input clock rate override in Hz, or `None` to use
                the subclass default.

        Returns:
            Float32 mono PCM array at *output_rate_hz* Hz.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        message = f"{cls.__name__} does not implement decode."
        raise NotImplementedError(message)

    @classmethod
    def encode(cls, samples: np.ndarray) -> bytes:
        """Encode float32 mono PCM to an RTP payload.

        The base implementation only raises; subclasses supply the
        codec-specific encoding.

        Args:
            samples: Float32 mono PCM at `sample_rate_hz` Hz.

        Returns:
            Encoded bytes for one RTP payload.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        message = f"{cls.__name__} does not implement encode."
        raise NotImplementedError(message)

    @classmethod
    def packetize(cls, audio: np.ndarray) -> Iterator[bytes]:
        """Split *audio* into frames and yield one encoded payload per frame.

        Each `frame_size` slice is passed to `encode` on its own; the final
        slice may be shorter.  Override in subclasses (e.g. G.722) where the
        entire buffer must be encoded at once to preserve codec state.

        Args:
            audio: Float32 mono PCM at `sample_rate_hz` Hz.

        Yields:
            Encoded payload bytes, one per RTP packet.
        """
        frame_len = cls.frame_size
        for start in range(0, len(audio), frame_len):
            chunk = audio[start : start + frame_len]
            yield cls.encode(chunk)

channels class-attribute

Channel count (1 = mono, 2 = stereo).

encoding_name class-attribute

SDP encoding name (lowercase).

frame_size class-attribute

Audio samples per 20 ms RTP frame at sample_rate_hz.

payload_type class-attribute

RTP payload type number.

rtp_clock_rate_hz class-attribute

RTP timestamp clock rate in Hz (may differ from sample_rate_hz).

sample_rate_hz class-attribute

Actual audio sample rate in Hz.

timestamp_increment class-attribute

RTP timestamp ticks per frame at rtp_clock_rate_hz.

decode(payload, output_rate_hz, *, input_rate_hz=None) classmethod

Decode an RTP payload to float32 mono PCM.

Override in subclasses to implement codec-specific decoding.

Parameters:

Name Type Description Default
payload bytes

Raw RTP payload bytes.

required
output_rate_hz int

Target sample rate in Hz.

required
input_rate_hz int | None

Input clock rate override in Hz, or None to use the subclass default.

None

Returns:

Type Description
ndarray

Float32 mono PCM array at output_rate_hz Hz.

Source code in voip/codecs/base.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
@classmethod
def decode(
    cls,
    payload: bytes,
    output_rate_hz: int,
    *,
    input_rate_hz: int | None = None,
) -> np.ndarray:
    """Decode an RTP payload to float32 mono PCM.

    Override in subclasses to implement codec-specific decoding.

    Args:
        payload: Raw RTP payload bytes.
        output_rate_hz: Target sample rate in Hz.
        input_rate_hz: Input clock rate override in Hz, or `None` to use
            the subclass default.

    Returns:
        Float32 mono PCM array at *output_rate_hz* Hz.
    """
    raise NotImplementedError(f"{cls.__name__} does not implement decode.")

encode(samples) classmethod

Encode float32 mono PCM to an RTP payload.

Override in subclasses to implement codec-specific encoding.

Parameters:

Name Type Description Default
samples ndarray

Float32 mono PCM at sample_rate_hz Hz.

required

Returns:

Type Description
bytes

Encoded bytes for one RTP payload.

Source code in voip/codecs/base.py
150
151
152
153
154
155
156
157
158
159
160
161
162
@classmethod
def encode(cls, samples: np.ndarray) -> bytes:
    """Encode float32 mono PCM to an RTP payload.

    The base implementation only raises; subclasses supply the
    codec-specific encoding.

    Args:
        samples: Float32 mono PCM at `sample_rate_hz` Hz.

    Returns:
        Encoded bytes for one RTP payload.

    Raises:
        NotImplementedError: Always, unless overridden.
    """
    message = f"{cls.__name__} does not implement encode."
    raise NotImplementedError(message)

packetize(audio) classmethod

Encode audio and yield one encoded payload per 20 ms RTP frame.

The default implementation encodes one frame_size chunk at a time using encode. Override in subclasses (e.g. G.722) where the entire buffer must be encoded at once to preserve codec state.

Parameters:

Name Type Description Default
audio ndarray

Float32 mono PCM at sample_rate_hz Hz.

required

Yields:

Type Description
bytes

Encoded payload bytes, one per RTP packet.

Source code in voip/codecs/base.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
@classmethod
def packetize(cls, audio: np.ndarray) -> Iterator[bytes]:
    """Split *audio* into frames and yield one encoded payload per frame.

    Each `frame_size` slice is passed to `encode` on its own; the final
    slice may be shorter.  Override in subclasses (e.g. G.722) where the
    entire buffer must be encoded at once to preserve codec state.

    Args:
        audio: Float32 mono PCM at `sample_rate_hz` Hz.

    Yields:
        Encoded payload bytes, one per RTP packet.
    """
    frame_len = cls.frame_size
    for start in range(0, len(audio), frame_len):
        chunk = audio[start : start + frame_len]
        yield cls.encode(chunk)

resample(audio, source_rate_hz, destination_rate_hz) classmethod

Resample audio from source_rate_hz to destination_rate_hz.

Uses linear interpolation via numpy.interp.

Parameters:

Name Type Description Default
audio ndarray

Float32 mono PCM array.

required
source_rate_hz int

Sample rate of audio in Hz.

required
destination_rate_hz int

Target sample rate in Hz.

required

Returns:

Type Description
ndarray

Resampled float32 array at destination_rate_hz Hz, or audio unchanged when both rates are equal.

Source code in voip/codecs/base.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
@classmethod
def resample(
    cls, audio: np.ndarray, source_rate_hz: int, destination_rate_hz: int
) -> np.ndarray:
    """Convert *audio* from *source_rate_hz* to *destination_rate_hz*.

    Samples are linearly interpolated with [`numpy.interp`][] over an
    evenly spaced grid that spans the first to the last input sample.

    Args:
        audio: Float32 mono PCM array.
        source_rate_hz: Sample rate of *audio* in Hz.
        destination_rate_hz: Target sample rate in Hz.

    Returns:
        *audio* itself when the two rates match; otherwise a new float32
        array at *destination_rate_hz* Hz (empty for empty input).
    """
    if source_rate_hz == destination_rate_hz:
        return audio
    n_in = len(audio)
    if n_in == 0:
        return np.empty(0, dtype=np.float32)
    # Always produce at least one output sample for non-empty input.
    n_out = max(1, round(n_in * destination_rate_hz / source_rate_hz))
    positions = np.linspace(0, n_in - 1, n_out)
    interpolated = np.interp(positions, np.arange(n_in), audio)
    return interpolated.astype(np.float32)

to_payload_format() classmethod

Create an RTPPayloadFormat for SDP negotiation.

Uses rtp_clock_rate_hz as the SDP sample rate, which is correct per RFC 3551 (e.g. G.722 advertises 8000 Hz in SDP even though the actual audio runs at 16000 Hz).

Returns:

Type Description
RTPPayloadFormat

Payload format descriptor for this codec.

Source code in voip/codecs/base.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
@classmethod
def to_payload_format(cls) -> RTPPayloadFormat:
    """Build the [`RTPPayloadFormat`][voip.sdp.types.RTPPayloadFormat] used in SDP negotiation.

    The SDP sample rate is taken from `rtp_clock_rate_hz`, as RFC 3551
    requires (e.g. G.722 is advertised at 8000 Hz although its audio
    runs at 16000 Hz).

    Returns:
        Payload format descriptor for this codec.
    """
    descriptor = RTPPayloadFormat(
        payload_type=cls.payload_type,
        encoding_name=cls.encoding_name,
        sample_rate=cls.rtp_clock_rate_hz,
        channels=cls.channels,
    )
    return descriptor

voip.codecs.av.PyAVCodec

Bases: RTPCodec

RTP codec that decodes and encodes audio via PyAV.

Concrete implementations: Opus, G722.

Source code in voip/codecs/av.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class PyAVCodec(RTPCodec):
    """RTP codec whose decoding and encoding are delegated to [PyAV][].

    Concrete implementations: [`Opus`][voip.codecs.Opus],
    [`G722`][voip.codecs.G722].

    [PyAV]: https://pyav.basswood-io.com/
    """

    @classmethod
    def decode_pcm(
        cls,
        data: bytes,
        av_format: str,
        output_rate_hz: int,
        *,
        input_rate_hz: int | None = None,
    ) -> np.ndarray:
        """Decode raw audio bytes via PyAV into float32 mono PCM.

        The bytes are opened as an in-memory container, and every decoded
        frame is passed through a resampler configured for planar float,
        mono, *output_rate_hz*.

        Args:
            data: Raw audio bytes in the codec's wire format.
            av_format: PyAV format string (e.g. `"ogg"`, `"alaw"`).
            output_rate_hz: Target sample rate in Hz.
            input_rate_hz: Input clock rate hint for the PyAV decoder, or
                `None` for self-describing formats like Ogg.

        Returns:
            Float32 mono PCM array at *output_rate_hz* Hz.
        """
        resampler = av.audio.resampler.AudioResampler(
            format="fltp", layout="mono", rate=output_rate_hz
        )
        # The rate hint is forwarded as the "sample_rate" option only when given.
        open_options = (
            {} if input_rate_hz is None else {"sample_rate": str(input_rate_hz)}
        )
        chunks: list[np.ndarray] = []
        with av.open(
            io.BytesIO(data), mode="r", format=av_format, options=open_options
        ) as container:
            for decoded in container.decode(audio=0):
                chunks.extend(
                    out.to_ndarray().flatten() for out in resampler.resample(decoded)
                )
        # Passing None flushes whatever the resampler still holds.
        chunks.extend(out.to_ndarray().flatten() for out in resampler.resample(None))
        if not chunks:
            return np.array([], dtype=np.float32)
        return np.concatenate(chunks)

    @classmethod
    def encode_pcm(
        cls,
        samples: np.ndarray,
        av_codec_name: str,
        sample_rate_hz: int,
    ) -> bytes:
        """Encode float32 mono PCM to raw codec bytes via PyAV.

        Args:
            samples: Float32 mono PCM array in the range `[-1, 1]`.
            av_codec_name: PyAV codec name (e.g. `"g722"` or `"libopus"`).
            sample_rate_hz: Sample rate of *samples* in Hz.

        Returns:
            Encoded audio bytes.
        """
        encoder = cast(av.AudioCodecContext, av.CodecContext.create(av_codec_name, "w"))
        encoder.sample_rate = sample_rate_hz
        encoder.format = av.AudioFormat("s16")
        encoder.layout = av.AudioLayout("mono")
        encoder.open()
        # Scale to signed 16-bit and add a leading channel axis for the frame.
        scaled = np.round(samples * 32768.0)
        int16_pcm = np.clip(scaled, -32768, 32767).astype(np.int16)
        frame = av.AudioFrame.from_ndarray(
            int16_pcm[np.newaxis, :], format="s16", layout="mono"
        )
        frame.sample_rate = sample_rate_hz
        frame.pts = 0
        # Encode the single frame, then pass None to flush the encoder.
        packets = [*encoder.encode(frame), *encoder.encode(None)]
        return b"".join(bytes(packet) for packet in packets)

decode_pcm(data, av_format, output_rate_hz, *, input_rate_hz=None) classmethod

Decode raw audio bytes via PyAV into float32 mono PCM.

Parameters:

Name Type Description Default
data bytes

Raw audio bytes in the codec's wire format.

required
av_format str

PyAV format string (e.g. "ogg", "alaw").

required
output_rate_hz int

Target sample rate in Hz.

required
input_rate_hz int | None

Input clock rate hint for the PyAV decoder, or None for self-describing formats like Ogg.

None

Returns:

Type Description
ndarray

Float32 mono PCM array at output_rate_hz Hz.

Source code in voip/codecs/av.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
@classmethod
def decode_pcm(
    cls,
    data: bytes,
    av_format: str,
    output_rate_hz: int,
    *,
    input_rate_hz: int | None = None,
) -> np.ndarray:
    """Decode raw audio bytes via PyAV into float32 mono PCM.

    The bytes are opened as an in-memory container, and every decoded
    frame is passed through a resampler configured for planar float,
    mono, *output_rate_hz*.

    Args:
        data: Raw audio bytes in the codec's wire format.
        av_format: PyAV format string (e.g. `"ogg"`, `"alaw"`).
        output_rate_hz: Target sample rate in Hz.
        input_rate_hz: Input clock rate hint for the PyAV decoder, or
            `None` for self-describing formats like Ogg.

    Returns:
        Float32 mono PCM array at *output_rate_hz* Hz.
    """
    resampler = av.audio.resampler.AudioResampler(
        format="fltp", layout="mono", rate=output_rate_hz
    )
    # The rate hint is forwarded as the "sample_rate" option only when given.
    open_options = (
        {} if input_rate_hz is None else {"sample_rate": str(input_rate_hz)}
    )
    chunks: list[np.ndarray] = []
    with av.open(
        io.BytesIO(data), mode="r", format=av_format, options=open_options
    ) as container:
        for decoded in container.decode(audio=0):
            chunks.extend(
                out.to_ndarray().flatten() for out in resampler.resample(decoded)
            )
    # Passing None flushes whatever the resampler still holds.
    chunks.extend(out.to_ndarray().flatten() for out in resampler.resample(None))
    if not chunks:
        return np.array([], dtype=np.float32)
    return np.concatenate(chunks)

encode_pcm(samples, av_codec_name, sample_rate_hz) classmethod

Encode float32 mono PCM to raw codec bytes via PyAV.

Parameters:

Name Type Description Default
samples ndarray

Float32 mono PCM array in the range [-1, 1].

required
av_codec_name str

PyAV codec name (e.g. "g722" or "libopus").

required
sample_rate_hz int

Sample rate of samples in Hz.

required

Returns:

Type Description
bytes

Encoded audio bytes.

Source code in voip/codecs/av.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
@classmethod
def encode_pcm(
    cls,
    samples: np.ndarray,
    av_codec_name: str,
    sample_rate_hz: int,
) -> bytes:
    """Encode float32 mono PCM to raw codec bytes via PyAV.

    Args:
        samples: Float32 mono PCM array in the range `[-1, 1]`.
        av_codec_name: PyAV codec name (e.g. `"g722"` or `"libopus"`).
        sample_rate_hz: Sample rate of *samples* in Hz.

    Returns:
        Encoded audio bytes.
    """
    encoder = cast(av.AudioCodecContext, av.CodecContext.create(av_codec_name, "w"))
    encoder.sample_rate = sample_rate_hz
    encoder.format = av.AudioFormat("s16")
    encoder.layout = av.AudioLayout("mono")
    encoder.open()
    # Scale to signed 16-bit and add a leading channel axis for the frame.
    scaled = np.round(samples * 32768.0)
    int16_pcm = np.clip(scaled, -32768, 32767).astype(np.int16)
    frame = av.AudioFrame.from_ndarray(
        int16_pcm[np.newaxis, :], format="s16", layout="mono"
    )
    frame.sample_rate = sample_rate_hz
    frame.pts = 0
    # Encode the single frame, then pass None to flush the encoder.
    packets = [*encoder.encode(frame), *encoder.encode(None)]
    return b"".join(bytes(packet) for packet in packets)