Skip to content

Instantly share code, notes, and snippets.

@flying-sheep
Last active December 16, 2025 12:25
Show Gist options
  • Select an option

  • Save flying-sheep/961f9a1d9d22142f4265ae22c8d22dc3 to your computer and use it in GitHub Desktop.

Select an option

Save flying-sheep/961f9a1d9d22142f4265ae22c8d22dc3 to your computer and use it in GitHub Desktop.
profile different pandas string array types
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.13"
# dependencies = [ "scanpy>=1.12rc0", "pyarrow", "memory-profiler" ]
# ///
"""Profile arrow memory."""
from __future__ import annotations
import gc
import io
from string import ascii_lowercase
from subprocess import run
import sys
from typing import TYPE_CHECKING
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_object_dtype
from pandas._libs import lib
from memory_profiler import profile
from scanpy._utils.random import random_str
if TYPE_CHECKING:
from typing import Literal
from numpy.typing import NDArray
# Names of the measurement backends this script may hand to `profile(backend=...)`.
type Backend = Literal["tracemalloc", "psutil", "psutil_pss", "psutil_uss", "posix"]
# Backend passed to every `@profile(...)` decorator in this file.
backend: Backend = "psutil"
# In-memory sink for the profiler reports (passed as `stream=`); the
# `__main__` block reads it back with `file.getvalue()`.
file = io.StringIO()
def mk_raw(dtype: np.dtype | None = None) -> NDArray[np.str_]:
    """Build the raw string array that every profiled case starts from.

    Calls ``random_str`` for 1,000,000 strings of length 10 over the
    lowercase ASCII alphabet with ``rng=0``.

    Parameters
    ----------
    dtype
        Optional dtype to convert the generated array to via ``astype``.
        ``None`` (the default) returns the array exactly as generated.

    Returns
    -------
    The generated array, converted to ``dtype`` when one was given.
    """
    arr = random_str(1_000_000, length=10, alphabet=ascii_lowercase, rng=0)
    # Compare against None explicitly: whether a dtype was *given* must not
    # depend on how a particular dtype specifier happens to evaluate in a
    # boolean context.
    if dtype is None:
        return arr
    return arr.astype(dtype)
def memory_usage(array: pd.api.extensions.ExtensionArray) -> int:
if hasattr(array, "memory_usage"):
return array.memory_usage(deep=True)
v = array.nbytes
if is_object_dtype(array.dtype):
assert isinstance(array, pd.arrays.NumpyExtensionArray)
v += lib.memory_usage_of_objects(array._ndarray)
return v
# Case: the array returned by `mk_raw()` with no dtype conversion, wrapped in
# a `NumpyExtensionArray`. Returns the byte count from `memory_usage(arr)`.
# NOTE(review): every body line pairs one step with `gc.collect()`; this looks
# deliberate so the per-line report written by `profile` has one row per step.
# Keep the line layout, and keep comments outside the function body.
@profile(backend=backend, stream=file)
def profile_numpy_unicode_mem() -> int:
    arr = pd.arrays.NumpyExtensionArray(mk_raw()); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Case: the raw strings converted to `object` dtype (`mk_raw(object)`),
# wrapped in a `NumpyExtensionArray`. Returns the byte count from
# `memory_usage(arr)`.
# NOTE(review): every body line pairs one step with `gc.collect()`; this looks
# deliberate so the per-line report written by `profile` has one row per step.
# Keep the line layout, and keep comments outside the function body.
@profile(backend=backend, stream=file)
def profile_numpy_obj_mem() -> int:
    arr = pd.arrays.NumpyExtensionArray(mk_raw(object)); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Case: the raw strings converted to `np.dtypes.StringDType(na_object=None)`,
# wrapped in a `NumpyExtensionArray`. Returns the byte count from
# `memory_usage(arr)`.
# NOTE(review): every body line pairs one step with `gc.collect()`; this looks
# deliberate so the per-line report written by `profile` has one row per step.
# Keep the line layout, and keep comments outside the function body.
@profile(backend=backend, stream=file)
def profile_numpy_str_mem_na() -> int:
    arr = pd.arrays.NumpyExtensionArray(mk_raw(np.dtypes.StringDType(na_object=None))); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Case: the raw strings converted to `np.dtypes.StringDType()` (no
# `na_object` argument), wrapped in a `NumpyExtensionArray`. Returns the byte
# count from `memory_usage(arr)`.
# NOTE(review): every body line pairs one step with `gc.collect()`; this looks
# deliberate so the per-line report written by `profile` has one row per step.
# Keep the line layout, and keep comments outside the function body.
@profile(backend=backend, stream=file)
def profile_numpy_str_mem() -> int:
    arr = pd.arrays.NumpyExtensionArray(mk_raw(np.dtypes.StringDType())); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Case: `pd.array(...)` built from the raw strings with
# `pd.StringDtype("python")`. Returns the byte count from `memory_usage(arr)`.
# NOTE(review): every body line pairs one step with `gc.collect()`; this looks
# deliberate so the per-line report written by `profile` has one row per step.
# Keep the line layout, and keep comments outside the function body.
@profile(backend=backend, stream=file)
def profile_python_mem() -> int:
    arr = pd.array(mk_raw(), dtype=pd.StringDtype("python")); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Case: `pd.array(...)` built from the raw strings with
# `pd.StringDtype("pyarrow")`. Returns the byte count from `memory_usage(arr)`.
# NOTE(review): every body line pairs one step with `gc.collect()`; this looks
# deliberate so the per-line report written by `profile` has one row per step.
# Keep the line layout, and keep comments outside the function body.
@profile(backend=backend, stream=file)
def profile_arrow_mem() -> int:
    arr = pd.array(mk_raw(), dtype=pd.StringDtype("pyarrow")); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Profiled cases, addressed by position: `<script> <index>` runs exactly one.
funcs = [
    profile_numpy_unicode_mem,
    profile_numpy_obj_mem,
    profile_numpy_str_mem_na,
    profile_numpy_str_mem,
    profile_python_mem,
    profile_arrow_mem,
]

if __name__ == "__main__":
    if len(sys.argv) == 2:
        # Worker mode: run the single case selected by the index argument and
        # print its self-reported size next to the captured profiler report.
        f = funcs[int(sys.argv[1])]
        bytes_ = f()
        # Drop the first two and the last three lines of the captured report.
        out = "\n".join(file.getvalue().splitlines()[2:-3])
        print(f"{f.__name__} (Self-reported: {bytes_ / 1e6:.2f} MB):")
        print(f"Profiled:\n{out}\n")
        sys.exit(0)
    # Driver mode: re-invoke this script once per case, each in its own
    # interpreter process (presumably so the cases cannot influence each
    # other's measurements). Every case still runs even if an earlier one
    # fails, but a failure is no longer hidden behind exit status 0.
    return_codes = [
        run([sys.executable, sys.argv[0], str(index)], check=False).returncode
        for index in range(len(funcs))
    ]
    sys.exit(1 if any(return_codes) else 0)
# NOTE(review): this is TOML, not Python — it looks like a separate Hatch
# environment config from the same gist that got appended to the script;
# confirm and keep it in its own file. The dependency list repeats the one in
# the script's inline metadata.
[envs.default]
detached = true
installer = "uv"
python = "3.13"
dependencies = [ "scanpy>=1.12rc0", "pyarrow", "memory-profiler" ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment