Skip to content

Instantly share code, notes, and snippets.

@TheLurps
Last active April 1, 2025 09:36
Show Gist options
  • Select an option

  • Save TheLurps/ae704899fdb05599c9b4f6f0fd8fce03 to your computer and use it in GitHub Desktop.

Select an option

Save TheLurps/ae704899fdb05599c9b4f6f0fd8fce03 to your computer and use it in GitHub Desktop.
DuckDB sampling from parquet and hive partitioned parquet files
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "14e831ab",
"metadata": {
"jupyter": {
"source_hidden": true
}
},
"outputs": [],
"source": [
"# /// script\n",
"# requires-python = \">=3.12\"\n",
"# dependencies = [\n",
"# \"duckdb\",\n",
"# \"numpy\",\n",
"# \"pandas\",\n",
"# \"pyarrow\",\n",
"# ]\n",
"#\n",
"# [tool.uv]\n",
"# exclude-newer = \"2025-04-01T09:16:45.240871707+02:00\"\n",
"# ///"
]
},
{
"cell_type": "markdown",
"id": "b407c434",
"metadata": {},
"source": [
"# Sampling with conn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7a226178",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import duckdb\n",
"import numpy as np\n",
"import pandas as pd\n",
"from tempfile import TemporaryDirectory, NamedTemporaryFile"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cb90c012",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Python version: 3.12.9 (main, Feb 12 2025, 14:50:50) [Clang 19.1.6 ]\n",
"DuckDB version: 1.2.1\n",
"NumPy version: 2.2.4\n",
"Pandas version: 2.2.3\n"
]
}
],
"source": [
"print(\"Python version:\", os.sys.version)\n",
"print(\"DuckDB version:\", duckdb.__version__)\n",
"print(\"NumPy version:\", np.__version__)\n",
"print(\"Pandas version:\", pd.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d253c790",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2fb83da2f0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# required to ensure repeatable results\n",
"conn = duckdb.connect(\":memory:\")\n",
"conn.execute(\"SET threads = 1;\")"
]
},
{
"cell_type": "markdown",
"id": "c30a8ed0",
"metadata": {},
"source": [
"## Generate dummy data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "94b7fb47",
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(42)\n",
"size = 1_000_000\n",
"df = pd.DataFrame({\n",
" 'range': range(size),\n",
" 'bin': np.random.randint(0, 10, size=size),\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6567c67e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1000000 entries, 0 to 999999\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 range 1000000 non-null int64\n",
" 1 bin 1000000 non-null int64\n",
"dtypes: int64(2)\n",
"memory usage: 15.3 MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4f053a1f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>range</th>\n",
" <th>bin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" range bin\n",
"0 0 6\n",
"1 1 3\n",
"2 2 7\n",
"3 3 4\n",
"4 4 6"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "eadd746b",
"metadata": {},
"source": [
"## Check if sample is repeatable"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "206096fc",
"metadata": {},
"outputs": [],
"source": [
"# result from previous run\n",
"sample10 = [(32579,),\n",
" (20203,),\n",
" (17919,),\n",
" (23196,),\n",
" (25754,),\n",
" (38699,),\n",
" (35653,),\n",
" (28856,),\n",
" (33818,),\n",
" (40181,)]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "dc110516",
"metadata": {},
"outputs": [],
"source": [
"def sample10_mem(df):\n",
" return conn.sql(\"\"\"\n",
" SELECT range\n",
" FROM df\n",
" USING SAMPLE reservoir(10 ROWS)\n",
" REPEATABLE(42);\n",
" \"\"\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7b1e5796",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(32579,),\n",
" (20203,),\n",
" (17919,),\n",
" (23196,),\n",
" (25754,),\n",
" (38699,),\n",
" (35653,),\n",
" (28856,),\n",
" (33818,),\n",
" (40181,)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample10_mem_result = sample10_mem(df)\n",
"sample10_mem_result"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8d2b9acc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample10_mem_repeatable = sample10_mem_result == sample10\n",
"sample10_mem_repeatable"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "1e6d91be",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"435 μs ± 4.31 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"sample10_mem_time = %timeit -o sample10_mem(df)"
]
},
{
"cell_type": "markdown",
"id": "80b5ccc2",
"metadata": {},
"source": [
"## Sample from parquet file"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "d218f3f8",
"metadata": {},
"outputs": [],
"source": [
"tmp = NamedTemporaryFile(suffix='.parquet')\n",
"parquet_file = tmp.name"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "846f3e66",
"metadata": {},
"outputs": [],
"source": [
"conn.sql(f\"\"\"\n",
"COPY (FROM df)\n",
"TO '{parquet_file}'\n",
"(FORMAT 'parquet', OVERWRITE);\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "0c4d1180",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4,8M\t/tmp/tmp51y3qyad.parquet\n"
]
}
],
"source": [
"!du -sh $parquet_file"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "3fa5e83f",
"metadata": {},
"outputs": [],
"source": [
"def sample10_parquet(parquet_file):\n",
" return conn.sql(f\"\"\"\n",
" SELECT range\n",
" FROM '{parquet_file}'\n",
" USING SAMPLE reservoir(10 ROWS)\n",
" REPEATABLE(42);\n",
" \"\"\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "a020c3c1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(32579,),\n",
" (20203,),\n",
" (17919,),\n",
" (23196,),\n",
" (25754,),\n",
" (38699,),\n",
" (35653,),\n",
" (28856,),\n",
" (33818,),\n",
" (40181,)]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample10_parquet_result = sample10_parquet(parquet_file)\n",
"sample10_parquet_result"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "9e456f41",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample10_parquet_repeatable = sample10_parquet_result == sample10\n",
"sample10_parquet_repeatable"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8dc382eb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.44 ms ± 23.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"sample10_parquet_time = %timeit -o sample10_parquet(parquet_file)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "cadf3c51",
"metadata": {},
"outputs": [],
"source": [
"tmp.close()"
]
},
{
"cell_type": "markdown",
"id": "b024ca75",
"metadata": {},
"source": [
"## Sample from hive partitioned parquet files"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "cca724ea",
"metadata": {},
"outputs": [],
"source": [
"tmp = TemporaryDirectory()\n",
"hive_path = tmp.name"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "6a58be66",
"metadata": {},
"outputs": [],
"source": [
"conn.sql(f\"\"\"\n",
"COPY (FROM df)\n",
"TO '{hive_path}'\n",
"(FORMAT 'parquet', PARTITION_BY bin, OVERWRITE);\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "fcbc526b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[01;34m/tmp/tmpyrj8v5be\u001b[0m\n",
"├── \u001b[01;34mbin=0\u001b[0m\n",
"│   └── data_0.parquet\n",
"├── \u001b[01;34mbin=1\u001b[0m\n",
"│   └── data_0.parquet\n",
"├── \u001b[01;34mbin=2\u001b[0m\n",
"│   └── data_0.parquet\n",
"├── \u001b[01;34mbin=3\u001b[0m\n",
"│   └── data_0.parquet\n",
"├── \u001b[01;34mbin=4\u001b[0m\n",
"│   └── data_0.parquet\n",
"├── \u001b[01;34mbin=5\u001b[0m\n",
"│   └── data_0.parquet\n",
"├── \u001b[01;34mbin=6\u001b[0m\n",
"│   └── data_0.parquet\n",
"├── \u001b[01;34mbin=7\u001b[0m\n",
"│   └── data_0.parquet\n",
"├── \u001b[01;34mbin=8\u001b[0m\n",
"│   └── data_0.parquet\n",
"└── \u001b[01;34mbin=9\u001b[0m\n",
" └── data_0.parquet\n",
"\n",
"11 directories, 10 files\n"
]
}
],
"source": [
"!tree $hive_path"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "ae591e33",
"metadata": {},
"outputs": [],
"source": [
"def sample10_hive(hive_path):\n",
" return conn.sql(f\"\"\"\n",
" SELECT range\n",
" FROM read_parquet('{hive_path}/*/*.parquet', hive_partitioning = true)\n",
" USING SAMPLE reservoir(10 ROWS)\n",
" REPEATABLE(42);\n",
" \"\"\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "7321c7c1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(325557,),\n",
" (201776,),\n",
" (178697,),\n",
" (231953,),\n",
" (257193,),\n",
" (386242,),\n",
" (356196,),\n",
" (287870,),\n",
" (337526,),\n",
" (401396,)]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample10_hive_result = sample10_hive(hive_path)\n",
"sample10_hive_result"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "6230109f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample10_hive_repeatable = sample10_hive_result == [(325557,),\n",
" (201776,),\n",
" (178697,),\n",
" (231953,),\n",
" (257193,),\n",
" (386242,),\n",
" (356196,),\n",
" (287870,),\n",
" (337526,),\n",
" (401396,)]\n",
"sample10_hive_repeatable"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "52a74e2b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.75 ms ± 28.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"sample10_hive_time = %timeit -o sample10_hive(hive_path)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "be742123",
"metadata": {},
"outputs": [],
"source": [
"tmp.cleanup()"
]
},
{
"cell_type": "markdown",
"id": "b11c5371",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "b34043b7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>average</th>\n",
" <th>std</th>\n",
" <th>best</th>\n",
" <th>repeatable</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>memory</th>\n",
" <td>0.000435</td>\n",
" <td>0.000004</td>\n",
" <td>0.000432</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>parquet</th>\n",
" <td>0.006443</td>\n",
" <td>0.000023</td>\n",
" <td>0.006420</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hive</th>\n",
" <td>0.006749</td>\n",
" <td>0.000029</td>\n",
" <td>0.006690</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" average std best repeatable\n",
"memory 0.000435 0.000004 0.000432 True\n",
"parquet 0.006443 0.000023 0.006420 True\n",
"hive 0.006749 0.000029 0.006690 True"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({\n",
" 'average': [sample10_mem_time.average,\n",
" sample10_parquet_time.average,\n",
" sample10_hive_time.average],\n",
" 'std': [sample10_mem_time.stdev,\n",
" sample10_parquet_time.stdev,\n",
" sample10_hive_time.stdev],\n",
" 'best': [sample10_mem_time.best,\n",
" sample10_parquet_time.best,\n",
" sample10_hive_time.best],\n",
" 'repeatable': [sample10_mem_repeatable,\n",
" sample10_parquet_repeatable,\n",
" sample10_hive_repeatable],\n",
"}, index=['memory', 'parquet', 'hive'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
import duckdb
import numpy as np
import pandas as pd
from tempfile import TemporaryDirectory, NamedTemporaryFile
conn = duckdb.connect(":memory:")
conn.execute("SET threads = 1;")
# generate data
np.random.seed(42)
size = 1_000_000
df = pd.DataFrame(
{
"range": range(size),
"bin": np.random.randint(0, 10, size=size),
}
)
print("sampling from df")
print(
conn.sql("""
SELECT range
FROM df
USING SAMPLE reservoir(10 ROWS)
REPEATABLE(42);
""").fetchall()
)
# write to single parquet
parquet_file = NamedTemporaryFile(suffix=".parquet")
conn.sql(f"""
COPY (FROM df)
TO '{parquet_file.name}'
(FORMAT 'parquet', OVERWRITE);
""")
print("sampling from single parquet")
print(
conn.sql(f"""
SELECT range
FROM '{parquet_file.name}'
USING SAMPLE reservoir(10 ROWS)
REPEATABLE(42);
""").fetchall()
)
parquet_file.close()
# write a hive partitioned parquet files
hive_path = TemporaryDirectory()
conn.sql(f"""
COPY (FROM df)
TO '{hive_path.name}'
(FORMAT 'parquet', PARTITION_BY bin, OVERWRITE);
""")
print("sampling from hive partitioned parquet files")
print(
conn.sql(f"""
SELECT range
FROM read_parquet('{hive_path.name}/*/*.parquet', hive_partitioning = true)
USING SAMPLE reservoir(10 ROWS)
REPEATABLE(42);
""").fetchall()
)
hive_path.cleanup()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment