Last active
December 16, 2025 12:25
-
-
Save flying-sheep/961f9a1d9d22142f4265ae22c8d22dc3 to your computer and use it in GitHub Desktop.
profile different pandas string array types
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # /// script | |
| # requires-python = ">=3.13" | |
| # dependencies = [ "scanpy>=1.12rc0", "pyarrow", "memory-profiler" ] | |
| # /// | |
| """Profile arrow memory.""" | |
| from __future__ import annotations | |
| import gc | |
| import io | |
| from string import ascii_lowercase | |
| from subprocess import run | |
| import sys | |
| from typing import TYPE_CHECKING | |
| import numpy as np | |
| import pandas as pd | |
| from pandas.core.dtypes.common import is_object_dtype | |
| from pandas._libs import lib | |
| from memory_profiler import profile | |
| from scanpy._utils.random import random_str | |
| if TYPE_CHECKING: | |
| from typing import Literal | |
| from numpy.typing import NDArray | |
# Allowed spellings for the `backend` setting below.
type Backend = Literal["tracemalloc", "psutil", "psutil_pss", "psutil_uss", "posix"]
# Measurement backend handed to every `@profile(...)` decorator in this script.
backend: Backend = "psutil"
# In-memory text buffer handed to `@profile(...)` as `stream`; the `__main__`
# block reads it back with `file.getvalue()` after a benchmark has run.
file = io.StringIO()
def mk_raw(dtype: np.dtype | None = None) -> NDArray[np.str_]:
    """Build the raw string array that every benchmark starts from.

    Calls ``random_str(1_000_000, length=10, alphabet=ascii_lowercase, rng=0)``
    (presumably one million 10-character lowercase strings from a fixed seed;
    confirm against scanpy's helper).

    Parameters
    ----------
    dtype
        When truthy, the generated array is converted with ``.astype(dtype)``
        before being returned; otherwise it is returned as generated.
        NOTE(review): callers in this script also pass the builtin ``object``,
        which the annotation does not cover.

    Returns
    -------
    The generated (and possibly converted) array.
    """
    strings = random_str(1_000_000, length=10, alphabet=ascii_lowercase, rng=0)
    if not dtype:
        return strings
    return strings.astype(dtype)
| def memory_usage(array: pd.api.extensions.ExtensionArray) -> int: | |
| if hasattr(array, "memory_usage"): | |
| return array.memory_usage(deep=True) | |
| v = array.nbytes | |
| if is_object_dtype(array.dtype): | |
| assert isinstance(array, pd.arrays.NumpyExtensionArray) | |
| v += lib.memory_usage_of_objects(array._ndarray) | |
| return v | |
# Benchmark: the unconverted output of `mk_raw()` wrapped in a
# `pd.arrays.NumpyExtensionArray` (per the function name, a NumPy unicode
# array).
#
# Steps, each followed by `gc.collect()` on the same source line:
#   1. build the array,
#   2. record its self-reported size via `memory_usage`,
#   3. call `tolist()` and discard the result (presumably to expose the
#      transient cost of materialising Python strings; confirm intent),
#   4. delete the array.
# Returns the self-reported size in bytes from step 2.
#
# NOTE(review): comments are kept above the decorator on purpose; the
# profiler's line-by-line report presumably echoes the function's own source
# lines, so the body is left untouched.
@profile(backend=backend, stream=file)
def profile_numpy_unicode_mem() -> int:
    arr = pd.arrays.NumpyExtensionArray(mk_raw()); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Benchmark: the output of `mk_raw(object)` (the raw strings converted with
# `.astype(object)`) wrapped in a `pd.arrays.NumpyExtensionArray`.
#
# Steps, each followed by `gc.collect()` on the same source line:
#   1. build the array,
#   2. record its self-reported size via `memory_usage`,
#   3. call `tolist()` and discard the result (presumably to expose the
#      transient cost of materialising a Python list; confirm intent),
#   4. delete the array.
# Returns the self-reported size in bytes from step 2.
#
# NOTE(review): comments are kept above the decorator on purpose; the
# profiler's line-by-line report presumably echoes the function's own source
# lines, so the body is left untouched.
@profile(backend=backend, stream=file)
def profile_numpy_obj_mem() -> int:
    arr = pd.arrays.NumpyExtensionArray(mk_raw(object)); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Benchmark: the raw strings converted to
# `np.dtypes.StringDType(na_object=None)` and wrapped in a
# `pd.arrays.NumpyExtensionArray`.
#
# Steps, each followed by `gc.collect()` on the same source line:
#   1. build the array,
#   2. record its self-reported size via `memory_usage`,
#   3. call `tolist()` and discard the result (presumably to expose the
#      transient cost of materialising Python strings; confirm intent),
#   4. delete the array.
# Returns the self-reported size in bytes from step 2.
#
# NOTE(review): comments are kept above the decorator on purpose; the
# profiler's line-by-line report presumably echoes the function's own source
# lines, so the body is left untouched.
@profile(backend=backend, stream=file)
def profile_numpy_str_mem_na() -> int:
    arr = pd.arrays.NumpyExtensionArray(mk_raw(np.dtypes.StringDType(na_object=None))); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Benchmark: the raw strings converted to a default-constructed
# `np.dtypes.StringDType()` and wrapped in a `pd.arrays.NumpyExtensionArray`.
#
# Steps, each followed by `gc.collect()` on the same source line:
#   1. build the array,
#   2. record its self-reported size via `memory_usage`,
#   3. call `tolist()` and discard the result (presumably to expose the
#      transient cost of materialising Python strings; confirm intent),
#   4. delete the array.
# Returns the self-reported size in bytes from step 2.
#
# NOTE(review): comments are kept above the decorator on purpose; the
# profiler's line-by-line report presumably echoes the function's own source
# lines, so the body is left untouched.
@profile(backend=backend, stream=file)
def profile_numpy_str_mem() -> int:
    arr = pd.arrays.NumpyExtensionArray(mk_raw(np.dtypes.StringDType())); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Benchmark: the unconverted output of `mk_raw()` turned into a pandas array
# with `dtype=pd.StringDtype("python")`.
#
# Steps, each followed by `gc.collect()` on the same source line:
#   1. build the array,
#   2. record its self-reported size via `memory_usage`,
#   3. call `tolist()` and discard the result (presumably to expose the
#      transient cost of materialising a Python list; confirm intent),
#   4. delete the array.
# Returns the self-reported size in bytes from step 2.
#
# NOTE(review): comments are kept above the decorator on purpose; the
# profiler's line-by-line report presumably echoes the function's own source
# lines, so the body is left untouched.
@profile(backend=backend, stream=file)
def profile_python_mem() -> int:
    arr = pd.array(mk_raw(), dtype=pd.StringDtype("python")); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# Benchmark: the unconverted output of `mk_raw()` turned into a pandas array
# with `dtype=pd.StringDtype("pyarrow")`.
#
# Steps, each followed by `gc.collect()` on the same source line:
#   1. build the array,
#   2. record its self-reported size via `memory_usage`,
#   3. call `tolist()` and discard the result (presumably to expose the
#      transient cost of materialising Python strings; confirm intent),
#   4. delete the array.
# Returns the self-reported size in bytes from step 2.
#
# NOTE(review): comments are kept above the decorator on purpose; the
# profiler's line-by-line report presumably echoes the function's own source
# lines, so the body is left untouched.
@profile(backend=backend, stream=file)
def profile_arrow_mem() -> int:
    arr = pd.array(mk_raw(), dtype=pd.StringDtype("pyarrow")); gc.collect()
    bytes_ = memory_usage(arr); gc.collect()
    arr.tolist(); gc.collect()
    del arr; gc.collect()
    return bytes_
# All benchmarks, in the order they are run.  A benchmark's position in this
# list is the integer accepted as the script's single command-line argument,
# so reordering entries changes which index selects which benchmark.
funcs = [
    profile_numpy_unicode_mem,
    profile_numpy_obj_mem,
    profile_numpy_str_mem_na,
    profile_numpy_str_mem,
    profile_python_mem,
    profile_arrow_mem,
]
if __name__ == "__main__":
    # Worker mode: exactly one argument, the index into `funcs` of the single
    # benchmark to run in this process.
    if len(sys.argv) == 2:
        f = funcs[int(sys.argv[1])]
        bytes_ = f()
        # The profiler wrote its report into `file`; drop the first two and
        # the last three lines of it (presumably header/footer noise) before
        # printing.
        out = "\n".join(file.getvalue().splitlines()[2:-3])
        print(f"{f.__name__} (Self-reported: {bytes_ / 1e6:.2f} MB):")
        print(f"Profiled:\n{out}\n")
        sys.exit(0)

    # Driver mode: re-invoke this script once per benchmark, each in a fresh
    # interpreter (presumably so one benchmark's allocations cannot influence
    # the next measurement).  Every benchmark is run even if an earlier one
    # fails, but a failure is no longer swallowed: the driver exits non-zero
    # if any child did.
    return_codes = [
        run([sys.executable, sys.argv[0], str(i)]).returncode
        for i in range(len(funcs))
    ]
    sys.exit(1 if any(return_codes) else 0)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): looks like a Hatch environment definition used to run the
# profiling script; confirm.
# The dependency list is identical to the script's inline `# /// script`
# metadata; keep the two in sync.
[envs.default]
detached = true
installer = "uv"
python = "3.13"
dependencies = [ "scanpy>=1.12rc0", "pyarrow", "memory-profiler" ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment