article thumbnail
Python FFI
Call native C from Python
#python, #programming

Python FFI is how you wire pure Python to native code without writing a CPython extension by hand. You get to keep Python for orchestration while borrowing the speed and reach of C libraries. This deep dive focuses on two battle-tested tools: ctypes and cffi.

No magic here: FFI works because your OS loader can map a shared library (ELF on Linux, Mach-O on macOS, PE on Windows) into the process and expose its exported symbols. Python then calls those symbols through the platform calling convention. Your job is to describe the function prototypes and data layouts accurately and to manage memory ownership with care.

This article covers:

If you already know the basics, jump straight to the two examples.

ABI vs API modes

Pick ABI mode for exploration and light bindings. Pick API mode for production-grade, wide-surface bindings, especially when the header uses lots of typedefs and structs.

Finding and loading libraries

Portable loading is half the battle.

Precise type mapping

You must match sizes and signedness.

Strings, buffers, and ownership

Error handling and diagnostics

Callbacks and reentrancy

Arrays, slicing, and NumPy

Threading and the GIL

Variadic functions

Packaging and deployment checklist

Debugging strategies


Example A: a safe ctypes wrapper around zlib compress and uncompress

This shows how to map function prototypes, handle output buffers, and raise errors early.

import ctypes
import ctypes.util

# Resolve the zlib shared library name across platforms
def load_zlib():
    """Locate and load the zlib shared library across platforms.

    Tries ctypes.util.find_library first, then a short list of
    well-known sonames. Returns a ctypes.CDLL handle with errno
    tracking enabled; raises OSError if no zlib can be found.
    """
    libname = ctypes.util.find_library("z") or ctypes.util.find_library("zlib")
    if libname:
        return ctypes.CDLL(libname, use_errno=True)
    # find_library came up empty (no ldconfig/compiler available, or an
    # unusual install); probe the common platform-specific names directly.
    for fallback in ("zlib1.dll", "libz.so.1", "libz.dylib"):
        try:
            return ctypes.CDLL(fallback, use_errno=True)
        except OSError:
            continue
    raise OSError("could not find zlib on this system")

# Load zlib once at import time; every wrapper below shares this handle.
z = load_zlib()

# zlib types vary by platform, but uLong and uLongf are commonly unsigned long.
# ctypes.c_ulong follows the platform ABI (32 bits on Windows LLP64, 64 bits
# on most Unix LP64 systems), matching what zlib's own headers resolve to.
c_uLong = ctypes.c_ulong
c_uLong_p = ctypes.POINTER(c_uLong)  # uLongf *: in/out length parameters
c_Byte = ctypes.c_ubyte              # Bytef
c_Byte_p = ctypes.POINTER(c_Byte)    # Bytef *

# Declaring argtypes/restype up front lets ctypes validate and convert each
# call instead of silently passing wrong-sized integers.

# int compress2(Bytef *dest, uLongf *destLen, const Bytef *source, uLong sourceLen, int level);
z.compress2.argtypes = [c_Byte_p, c_uLong_p, c_Byte_p, c_uLong, ctypes.c_int]
z.compress2.restype = ctypes.c_int

# uLong compressBound(uLong sourceLen);
z.compressBound.argtypes = [c_uLong]
z.compressBound.restype = c_uLong

# int uncompress(Bytef *dest, uLongf *destLen, const Bytef *source, uLong sourceLen);
z.uncompress.argtypes = [c_Byte_p, c_uLong_p, c_Byte_p, c_uLong]
z.uncompress.restype = ctypes.c_int

# zlib return codes
Z_OK = 0
Z_MEM_ERROR = -4
Z_BUF_ERROR = -5
Z_STREAM_ERROR = -2

class ZlibError(RuntimeError):
    pass

def _errcheck(code, where):
    if code == Z_OK:
        return
    err = ctypes.get_errno()
    raise ZlibError(f"{where} failed with code {code}, errno {err}")

def zlib_compress(data: bytes, level: int = 6) -> bytes:
    """Compress *data* with zlib's compress2.

    Parameters:
        data: bytes-like payload (bytes, bytearray, or memoryview).
        level: zlib compression level, 0 (store) through 9 (best); default 6.

    Returns the compressed payload as bytes.
    Raises ZlibError if compress2 reports a failure (e.g. an invalid level).
    """
    if not isinstance(data, (bytes, bytearray, memoryview)):
        raise TypeError("data must be bytes like")
    # Normalize a memoryview so from_buffer_copy always sees a contiguous
    # buffer. bytes/bytearray already expose one, so passing them directly
    # avoids the redundant intermediate copy the bytearray(data) form made.
    if isinstance(data, memoryview):
        data = data.tobytes()
    src = (c_Byte * len(data)).from_buffer_copy(data)
    src_len = c_uLong(len(data))

    # compressBound gives the worst-case output size for this input length.
    dest_cap = z.compressBound(src_len)
    dest = (c_Byte * dest_cap)()
    dest_len = c_uLong(dest_cap)  # in: capacity; out: bytes actually written

    rc = z.compress2(dest, ctypes.byref(dest_len), src, src_len, int(level))
    _errcheck(rc, "compress2")

    # Only the first dest_len bytes of the buffer are valid output.
    return bytes(memoryview(dest)[:dest_len.value])

def zlib_uncompress(data: bytes, expected_size: int | None = None) -> bytes:
    """Decompress a zlib stream produced by compress/compress2.

    If *expected_size* is supplied, the output buffer starts at exactly
    that size. Otherwise a heuristic starting size is used and doubled on
    each Z_BUF_ERROR, up to a fixed number of attempts.
    """
    if not isinstance(data, (bytes, bytearray, memoryview)):
        raise TypeError("data must be bytes like")
    src = (c_Byte * len(data)).from_buffer_copy(bytearray(data))
    src_len = c_uLong(len(data))

    # Heuristic when the caller does not know the uncompressed size.
    cap = expected_size or max(64, len(data) * 4)
    attempts = 6  # hard cap on buffer-growth retries
    while attempts:
        out = (c_Byte * cap)()
        out_len = c_uLong(cap)
        rc = z.uncompress(out, ctypes.byref(out_len), src, src_len)
        if rc == Z_OK:
            return bytes(memoryview(out)[:out_len.value])
        if rc != Z_BUF_ERROR:
            # Any failure other than "buffer too small" is fatal.
            _errcheck(rc, "uncompress")
        cap *= 2  # output did not fit; double and retry
        attempts -= 1
    raise ZlibError("uncompress ran out of attempts to grow buffer")

# Quick check: round-trip a repetitive payload and verify it survives intact.
payload = b"ffi " * 1000
packed = zlib_compress(payload, level=6)
unpacked = zlib_uncompress(packed, expected_size=len(payload))
assert unpacked == payload

Key takeaways:


Example B: cffi with callbacks, arrays, and automatic GC

This example binds the C standard library's qsort and uses a Python comparator. It also shows using ffi.gc to attach a destructor to a pointer that must be freed.

import sys
from cffi import FFI

ffi = FFI()

# Declare only the prototypes we call; cffi parses this C fragment at runtime.
ffi.cdef(
    """
    typedef int (*cmp_fn)(const void *a, const void *b);

    void qsort(void *base, size_t nmemb, size_t size, cmp_fn compar);

    // We will also use malloc and free to show ffi.gc
    void *malloc(size_t size);
    void free(void *ptr);
    """
)

# Resolve the C runtime that provides qsort, malloc, free.
if sys.platform.startswith("win"):
    libc = ffi.dlopen("msvcrt.dll")
else:
    # On POSIX, dlopen(None) returns a handle covering the main program and
    # the already-loaded C runtime, so qsort/malloc/free resolve without
    # guessing the libc soname (libc.so.6 on glibc, but different on musl
    # and macOS, where the old hard-coded name failed).
    libc = ffi.dlopen(None)

# Prepare an array of 32 bit ints
N = 10
arr = ffi.new("int[]", [7, 1, 2, 9, 5, 3, 8, 4, 6, 0])

# Create a comparator that sorts ascending
@ffi.callback("int(const void *, const void *)")
def cmp(a, b):
    # Cast the void* pointers back to int*
    ia = ffi.cast("const int *", a)[0]
    ib = ffi.cast("const int *", b)[0]
    # qsort expects negative, zero, or positive
    return -1 if ia < ib else (1 if ia > ib else 0)

# Call qsort. sizeof(int) is computed by cffi
libc.qsort(arr, N, ffi.sizeof("int"), cmp)

print([arr[i] for i in range(N)])  # sorted values 0..9

# Show ffi.gc with malloc: tie free() to the lifetime of the Python handle.
nbytes = 1024
raw = libc.malloc(nbytes)
if raw == ffi.NULL:
    # malloc can fail; an unchecked NULL here would crash on first write.
    raise MemoryError("libc.malloc returned NULL")
# free(buf) runs when the last reference to buf is dropped (refcount in
# CPython, collector on other interpreters) -- not merely "when GC runs".
buf = ffi.gc(raw, libc.free)

# Typed view for convenient byte access. NOTE: the cast pointer does not
# keep buf alive on its own; buf must stay referenced while p is in use.
p = ffi.cast("unsigned char *", buf)
for i in range(nbytes):
    p[i] = i % 256
print(int(p[0]), int(p[255]))

Important notes:


Building a tidy Pythonic wrapper

Clean bindings are pleasant to use. Tips:

When to prefer a C shim

Sometimes the safest path is to write 20 lines of C:

Your cffi API mode or a tiny CPython extension can expose a flat, FFI friendly API that you then bind from Python cleanly.

Final thoughts

FFI turns Python into a control plane for high performance C. The craft is in the details: correct prototypes, careful ownership, clear errors, and boring predictability around loading and packaging. Start with ABI bindings to learn the surface, then promote the hot path or tricky pieces to a minimal API mode shim. With that split, you get the best of both worlds: Python ergonomics with native speed where it matters.