TheLurps · April 1, 2025 09:36
diff --git a/duckdb_sampling.ipynb b/duckdb_sampling.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "14e831ab",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "# /// script\n",
    "# requires-python = \">=3.12\"\n",
    "# dependencies = [\n",
    "#     \"duckdb\",\n",
    "#     \"numpy\",\n",
    "#     \"pandas\",\n",
    "#     \"pyarrow\",\n",
    "# ]\n",
    "#\n",
    "# [tool.uv]\n",
    "# exclude-newer = \"2025-04-01T09:16:45.240871707+02:00\"\n",
    "# ///"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b407c434",
   "metadata": {},
   "source": [
    "# Sampling with conn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7a226178",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import duckdb\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tempfile import TemporaryDirectory, NamedTemporaryFile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cb90c012",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Python version: 3.12.9 (main, Feb 12 2025, 14:50:50) [Clang 19.1.6 ]\n",
      "DuckDB version: 1.2.1\n",
      "NumPy version: 2.2.4\n",
      "Pandas version: 2.2.3\n"
     ]
    }
   ],
   "source": [
    "print(\"Python version:\", os.sys.version)\n",
    "print(\"DuckDB version:\", duckdb.__version__)\n",
    "print(\"NumPy version:\", np.__version__)\n",
    "print(\"Pandas version:\", pd.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d253c790",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2fb83da2f0>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# required to ensure repeatable results\n",
    "conn = duckdb.connect(\":memory:\")\n",
    "conn.execute(\"SET threads = 1;\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c30a8ed0",
   "metadata": {},
   "source": [
    "## Generate dummy data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "94b7fb47",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "size = 1_000_000\n",
    "df = pd.DataFrame({\n",
    "    'range': range(size),\n",
    "    'bin': np.random.randint(0, 10, size=size),\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6567c67e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000000 entries, 0 to 999999\n",
      "Data columns (total 2 columns):\n",
      " #   Column  Non-Null Count    Dtype\n",
      "---  ------  --------------    -----\n",
      " 0   range   1000000 non-null  int64\n",
      " 1   bin     1000000 non-null  int64\n",
      "dtypes: int64(2)\n",
      "memory usage: 15.3 MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4f053a1f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>range</th>\n",
       "      <th>bin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   range  bin\n",
       "0      0    6\n",
       "1      1    3\n",
       "2      2    7\n",
       "3      3    4\n",
       "4      4    6"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eadd746b",
   "metadata": {},
   "source": [
    "## Check if sample is repeatable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "206096fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# result from previous run\n",
    "sample10 = [(32579,),\n",
    "            (20203,),\n",
    "            (17919,),\n",
    "            (23196,),\n",
    "            (25754,),\n",
    "            (38699,),\n",
    "            (35653,),\n",
    "            (28856,),\n",
    "            (33818,),\n",
    "            (40181,)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "dc110516",
   "metadata": {},
   "outputs": [],
   "source": [
    "def sample10_mem(df):\n",
    "    return conn.sql(\"\"\"\n",
    "        SELECT range\n",
    "        FROM df\n",
    "        USING SAMPLE reservoir(10 ROWS)\n",
    "        REPEATABLE(42);\n",
    "        \"\"\").fetchall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7b1e5796",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(32579,),\n",
       " (20203,),\n",
       " (17919,),\n",
       " (23196,),\n",
       " (25754,),\n",
       " (38699,),\n",
       " (35653,),\n",
       " (28856,),\n",
       " (33818,),\n",
       " (40181,)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample10_mem_result = sample10_mem(df)\n",
    "sample10_mem_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8d2b9acc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample10_mem_repeatable = sample10_mem_result == sample10\n",
    "sample10_mem_repeatable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1e6d91be",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "435 μs ± 4.31 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "sample10_mem_time = %timeit -o sample10_mem(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80b5ccc2",
   "metadata": {},
   "source": [
    "## Sample from parquet file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d218f3f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp = NamedTemporaryFile(suffix='.parquet')\n",
    "parquet_file = tmp.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "846f3e66",
   "metadata": {},
   "outputs": [],
   "source": [
    "conn.sql(f\"\"\"\n",
    "COPY (FROM df)\n",
    "TO '{parquet_file}'\n",
    "(FORMAT 'parquet', OVERWRITE);\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0c4d1180",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4,8M\t/tmp/tmp51y3qyad.parquet\n"
     ]
    }
   ],
   "source": [
    "!du -sh $parquet_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3fa5e83f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def sample10_parquet(parquet_file):\n",
    "    return conn.sql(f\"\"\"\n",
    "        SELECT range\n",
    "        FROM '{parquet_file}'\n",
    "        USING SAMPLE reservoir(10 ROWS)\n",
    "        REPEATABLE(42);\n",
    "        \"\"\").fetchall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a020c3c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(32579,),\n",
       " (20203,),\n",
       " (17919,),\n",
       " (23196,),\n",
       " (25754,),\n",
       " (38699,),\n",
       " (35653,),\n",
       " (28856,),\n",
       " (33818,),\n",
       " (40181,)]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample10_parquet_result = sample10_parquet(parquet_file)\n",
    "sample10_parquet_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9e456f41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample10_parquet_repeatable = sample10_parquet_result == sample10\n",
    "sample10_parquet_repeatable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "8dc382eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.44 ms ± 23.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "sample10_parquet_time = %timeit -o sample10_parquet(parquet_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "cadf3c51",
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b024ca75",
   "metadata": {},
   "source": [
    "## Sample from hive partitioned parquet files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cca724ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp = TemporaryDirectory()\n",
    "hive_path = tmp.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "6a58be66",
   "metadata": {},
   "outputs": [],
   "source": [
    "conn.sql(f\"\"\"\n",
    "COPY (FROM df)\n",
    "TO '{hive_path}'\n",
    "(FORMAT 'parquet', PARTITION_BY bin, OVERWRITE);\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "fcbc526b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[01;34m/tmp/tmpyrj8v5be\u001b[0m\n",
      "├── \u001b[01;34mbin=0\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "├── \u001b[01;34mbin=1\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "├── \u001b[01;34mbin=2\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "├── \u001b[01;34mbin=3\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "├── \u001b[01;34mbin=4\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "├── \u001b[01;34mbin=5\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "├── \u001b[01;34mbin=6\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "├── \u001b[01;34mbin=7\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "├── \u001b[01;34mbin=8\u001b[0m\n",
      "│   └── data_0.parquet\n",
      "└── \u001b[01;34mbin=9\u001b[0m\n",
      "    └── data_0.parquet\n",
      "\n",
      "11 directories, 10 files\n"
     ]
    }
   ],
   "source": [
    "!tree $hive_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "ae591e33",
   "metadata": {},
   "outputs": [],
   "source": [
    "def sample10_hive(hive_path):\n",
    "    return conn.sql(f\"\"\"\n",
    "        SELECT range\n",
    "        FROM read_parquet('{hive_path}/*/*.parquet', hive_partitioning = true)\n",
    "        USING SAMPLE reservoir(10 ROWS)\n",
    "        REPEATABLE(42);\n",
    "        \"\"\").fetchall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "7321c7c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(325557,),\n",
       " (201776,),\n",
       " (178697,),\n",
       " (231953,),\n",
       " (257193,),\n",
       " (386242,),\n",
       " (356196,),\n",
       " (287870,),\n",
       " (337526,),\n",
       " (401396,)]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample10_hive_result = sample10_hive(hive_path)\n",
    "sample10_hive_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "6230109f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample10_hive_repeatable = sample10_hive_result == [(325557,),\n",
    "                                                    (201776,),\n",
    "                                                    (178697,),\n",
    "                                                    (231953,),\n",
    "                                                    (257193,),\n",
    "                                                    (386242,),\n",
    "                                                    (356196,),\n",
    "                                                    (287870,),\n",
    "                                                    (337526,),\n",
    "                                                    (401396,)]\n",
    "sample10_hive_repeatable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "52a74e2b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.75 ms ± 28.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "sample10_hive_time =  %timeit -o sample10_hive(hive_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "be742123",
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp.cleanup()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b11c5371",
   "metadata": {},
   "source": [
    "## Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "b34043b7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>average</th>\n",
       "      <th>std</th>\n",
       "      <th>best</th>\n",
       "      <th>repeatable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>memory</th>\n",
       "      <td>0.000435</td>\n",
       "      <td>0.000004</td>\n",
       "      <td>0.000432</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>parquet</th>\n",
       "      <td>0.006443</td>\n",
       "      <td>0.000023</td>\n",
       "      <td>0.006420</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hive</th>\n",
       "      <td>0.006749</td>\n",
       "      <td>0.000029</td>\n",
       "      <td>0.006690</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          average       std      best  repeatable\n",
       "memory   0.000435  0.000004  0.000432        True\n",
       "parquet  0.006443  0.000023  0.006420        True\n",
       "hive     0.006749  0.000029  0.006690        True"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({\n",
    "    'average': [sample10_mem_time.average,\n",
    "                sample10_parquet_time.average,\n",
    "                sample10_hive_time.average],\n",
    "    'std': [sample10_mem_time.stdev,\n",
    "            sample10_parquet_time.stdev,\n",
    "            sample10_hive_time.stdev],\n",
    "    'best': [sample10_mem_time.best,\n",
    "             sample10_parquet_time.best,\n",
    "             sample10_hive_time.best],\n",
    "    'repeatable': [sample10_mem_repeatable,\n",
    "                   sample10_parquet_repeatable,\n",
    "                   sample10_hive_repeatable],\n",
    "}, index=['memory', 'parquet', 'hive'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
diff --git a/duckdb_sampling.py b/duckdb_sampling.py
 import duckdb
 import numpy as np
 import pandas as pd
 from tempfile import TemporaryDirectory, NamedTemporaryFile

 conn = duckdb.connect(":memory:")
 conn.execute("SET threads = 1;")

 # generate data
 np.random.seed(42)
 size = 1_000_000
 df = pd.DataFrame(
    {
        "range": range(size),
        "bin": np.random.randint(0, 10, size=size),
    }
 )

 print("sampling from df")
 print(
    conn.sql("""
  SELECT range
  FROM df
  USING SAMPLE reservoir(10 ROWS)
  REPEATABLE(42);
  """).fetchall()
 )

 # write to single parquet
 parquet_file = NamedTemporaryFile(suffix=".parquet")
 conn.sql(f"""
  COPY (FROM df)
  TO '{parquet_file.name}'
  (FORMAT 'parquet', OVERWRITE);
 """)

 print("sampling from single parquet")
 print(
    conn.sql(f"""
  SELECT range
  FROM '{parquet_file.name}'
  USING SAMPLE reservoir(10 ROWS)
  REPEATABLE(42);
  """).fetchall()
 )

 parquet_file.close()

 # write a hive partitioned parquet files
 hive_path = TemporaryDirectory()
 conn.sql(f"""
  COPY (FROM df)
  TO '{hive_path.name}'
  (FORMAT 'parquet', PARTITION_BY bin, OVERWRITE);
  """)

 print("sampling from hive partitioned parquet files")
 print(
    conn.sql(f"""
  SELECT range
  FROM read_parquet('{hive_path.name}/*/*.parquet', hive_partitioning = true)
  USING SAMPLE reservoir(10 ROWS)
  REPEATABLE(42);
  """).fetchall()
 )

 hive_path.cleanup()
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "14e831ab",
	"metadata": {
	"jupyter": {
	"source_hidden": true
	}
	},
	"outputs": [],
	"source": [
	"# /// script\n",
	"# requires-python = \">=3.12\"\n",
	"# dependencies = [\n",
	"# \"duckdb\",\n",
	"# \"numpy\",\n",
	"# \"pandas\",\n",
	"# \"pyarrow\",\n",
	"# ]\n",
	"#\n",
	"# [tool.uv]\n",
	"# exclude-newer = \"2025-04-01T09:16:45.240871707+02:00\"\n",
	"# ///"
	]
	},
	{
	"cell_type": "markdown",
	"id": "b407c434",
	"metadata": {},
	"source": [
	"# Sampling with conn"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "7a226178",
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import duckdb\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"from tempfile import TemporaryDirectory, NamedTemporaryFile"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "cb90c012",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Python version: 3.12.9 (main, Feb 12 2025, 14:50:50) [Clang 19.1.6 ]\n",
	"DuckDB version: 1.2.1\n",
	"NumPy version: 2.2.4\n",
	"Pandas version: 2.2.3\n"
	]
	}
	],
	"source": [
	"print(\"Python version:\", os.sys.version)\n",
	"print(\"DuckDB version:\", duckdb.__version__)\n",
	"print(\"NumPy version:\", np.__version__)\n",
	"print(\"Pandas version:\", pd.__version__)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "d253c790",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<duckdb.duckdb.DuckDBPyConnection at 0x7f2fb83da2f0>"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# required to ensure repeatable results\n",
	"conn = duckdb.connect(\":memory:\")\n",
	"conn.execute(\"SET threads = 1;\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "c30a8ed0",
	"metadata": {},
	"source": [
	"## Generate dummy data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "94b7fb47",
	"metadata": {},
	"outputs": [],
	"source": [
	"np.random.seed(42)\n",
	"size = 1_000_000\n",
	"df = pd.DataFrame({\n",
	" 'range': range(size),\n",
	" 'bin': np.random.randint(0, 10, size=size),\n",
	"})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "6567c67e",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"RangeIndex: 1000000 entries, 0 to 999999\n",
	"Data columns (total 2 columns):\n",
	" # Column Non-Null Count Dtype\n",
	"--- ------ -------------- -----\n",
	" 0 range 1000000 non-null int64\n",
	" 1 bin 1000000 non-null int64\n",
	"dtypes: int64(2)\n",
	"memory usage: 15.3 MB\n"
	]
	}
	],
	"source": [
	"df.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "4f053a1f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>range</th>\n",
	" <th>bin</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0</td>\n",
	" <td>6</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2</td>\n",
	" <td>7</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>3</td>\n",
	" <td>4</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>4</td>\n",
	" <td>6</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" range bin\n",
	"0 0 6\n",
	"1 1 3\n",
	"2 2 7\n",
	"3 3 4\n",
	"4 4 6"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.head()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "eadd746b",
	"metadata": {},
	"source": [
	"## Check if sample is repeatable"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "206096fc",
	"metadata": {},
	"outputs": [],
	"source": [
	"# result from previous run\n",
	"sample10 = [(32579,),\n",
	" (20203,),\n",
	" (17919,),\n",
	" (23196,),\n",
	" (25754,),\n",
	" (38699,),\n",
	" (35653,),\n",
	" (28856,),\n",
	" (33818,),\n",
	" (40181,)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "dc110516",
	"metadata": {},
	"outputs": [],
	"source": [
	"def sample10_mem(df):\n",
	" return conn.sql(\"\"\"\n",
	" SELECT range\n",
	" FROM df\n",
	" USING SAMPLE reservoir(10 ROWS)\n",
	" REPEATABLE(42);\n",
	" \"\"\").fetchall()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "7b1e5796",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(32579,),\n",
	" (20203,),\n",
	" (17919,),\n",
	" (23196,),\n",
	" (25754,),\n",
	" (38699,),\n",
	" (35653,),\n",
	" (28856,),\n",
	" (33818,),\n",
	" (40181,)]"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sample10_mem_result = sample10_mem(df)\n",
	"sample10_mem_result"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "8d2b9acc",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sample10_mem_repeatable = sample10_mem_result == sample10\n",
	"sample10_mem_repeatable"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "1e6d91be",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"435 μs ± 4.31 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
	]
	}
	],
	"source": [
	"sample10_mem_time = %timeit -o sample10_mem(df)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "80b5ccc2",
	"metadata": {},
	"source": [
	"## Sample from parquet file"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"id": "d218f3f8",
	"metadata": {},
	"outputs": [],
	"source": [
	"tmp = NamedTemporaryFile(suffix='.parquet')\n",
	"parquet_file = tmp.name"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"id": "846f3e66",
	"metadata": {},
	"outputs": [],
	"source": [
	"conn.sql(f\"\"\"\n",
	"COPY (FROM df)\n",
	"TO '{parquet_file}'\n",
	"(FORMAT 'parquet', OVERWRITE);\n",
	"\"\"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"id": "0c4d1180",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4,8M\t/tmp/tmp51y3qyad.parquet\n"
	]
	}
	],
	"source": [
	"!du -sh $parquet_file"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"id": "3fa5e83f",
	"metadata": {},
	"outputs": [],
	"source": [
	"def sample10_parquet(parquet_file):\n",
	" return conn.sql(f\"\"\"\n",
	" SELECT range\n",
	" FROM '{parquet_file}'\n",
	" USING SAMPLE reservoir(10 ROWS)\n",
	" REPEATABLE(42);\n",
	" \"\"\").fetchall()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"id": "a020c3c1",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(32579,),\n",
	" (20203,),\n",
	" (17919,),\n",
	" (23196,),\n",
	" (25754,),\n",
	" (38699,),\n",
	" (35653,),\n",
	" (28856,),\n",
	" (33818,),\n",
	" (40181,)]"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sample10_parquet_result = sample10_parquet(parquet_file)\n",
	"sample10_parquet_result"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"id": "9e456f41",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 18,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sample10_parquet_repeatable = sample10_parquet_result == sample10\n",
	"sample10_parquet_repeatable"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"id": "8dc382eb",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"6.44 ms ± 23.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
	]
	}
	],
	"source": [
	"sample10_parquet_time = %timeit -o sample10_parquet(parquet_file)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"id": "cadf3c51",
	"metadata": {},
	"outputs": [],
	"source": [
	"tmp.close()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "b024ca75",
	"metadata": {},
	"source": [
	"## Sample from hive partitioned parquet files"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"id": "cca724ea",
	"metadata": {},
	"outputs": [],
	"source": [
	"tmp = TemporaryDirectory()\n",
	"hive_path = tmp.name"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"id": "6a58be66",
	"metadata": {},
	"outputs": [],
	"source": [
	"conn.sql(f\"\"\"\n",
	"COPY (FROM df)\n",
	"TO '{hive_path}'\n",
	"(FORMAT 'parquet', PARTITION_BY bin, OVERWRITE);\n",
	"\"\"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"id": "fcbc526b",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\u001b[01;34m/tmp/tmpyrj8v5be\u001b[0m\n",
	"├── \u001b[01;34mbin=0\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"├── \u001b[01;34mbin=1\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"├── \u001b[01;34mbin=2\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"├── \u001b[01;34mbin=3\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"├── \u001b[01;34mbin=4\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"├── \u001b[01;34mbin=5\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"├── \u001b[01;34mbin=6\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"├── \u001b[01;34mbin=7\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"├── \u001b[01;34mbin=8\u001b[0m\n",
	"│ └── data_0.parquet\n",
	"└── \u001b[01;34mbin=9\u001b[0m\n",
	" └── data_0.parquet\n",
	"\n",
	"11 directories, 10 files\n"
	]
	}
	],
	"source": [
	"!tree $hive_path"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"id": "ae591e33",
	"metadata": {},
	"outputs": [],
	"source": [
	"def sample10_hive(hive_path):\n",
	" return conn.sql(f\"\"\"\n",
	" SELECT range\n",
	" FROM read_parquet('{hive_path}//.parquet', hive_partitioning = true)\n",
	" USING SAMPLE reservoir(10 ROWS)\n",
	" REPEATABLE(42);\n",
	" \"\"\").fetchall()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"id": "7321c7c1",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(325557,),\n",
	" (201776,),\n",
	" (178697,),\n",
	" (231953,),\n",
	" (257193,),\n",
	" (386242,),\n",
	" (356196,),\n",
	" (287870,),\n",
	" (337526,),\n",
	" (401396,)]"
	]
	},
	"execution_count": 25,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sample10_hive_result = sample10_hive(hive_path)\n",
	"sample10_hive_result"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"id": "6230109f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 26,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sample10_hive_repeatable = sample10_hive_result == [(325557,),\n",
	" (201776,),\n",
	" (178697,),\n",
	" (231953,),\n",
	" (257193,),\n",
	" (386242,),\n",
	" (356196,),\n",
	" (287870,),\n",
	" (337526,),\n",
	" (401396,)]\n",
	"sample10_hive_repeatable"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"id": "52a74e2b",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"6.75 ms ± 28.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
	]
	}
	],
	"source": [
	"sample10_hive_time = %timeit -o sample10_hive(hive_path)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"id": "be742123",
	"metadata": {},
	"outputs": [],
	"source": [
	"tmp.cleanup()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "b11c5371",
	"metadata": {},
	"source": [
	"## Results"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"id": "b34043b7",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>average</th>\n",
	" <th>std</th>\n",
	" <th>best</th>\n",
	" <th>repeatable</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>memory</th>\n",
	" <td>0.000435</td>\n",
	" <td>0.000004</td>\n",
	" <td>0.000432</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>parquet</th>\n",
	" <td>0.006443</td>\n",
	" <td>0.000023</td>\n",
	" <td>0.006420</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>hive</th>\n",
	" <td>0.006749</td>\n",
	" <td>0.000029</td>\n",
	" <td>0.006690</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" average std best repeatable\n",
	"memory 0.000435 0.000004 0.000432 True\n",
	"parquet 0.006443 0.000023 0.006420 True\n",
	"hive 0.006749 0.000029 0.006690 True"
	]
	},
	"execution_count": 29,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.DataFrame({\n",
	" 'average': [sample10_mem_time.average,\n",
	" sample10_parquet_time.average,\n",
	" sample10_hive_time.average],\n",
	" 'std': [sample10_mem_time.stdev,\n",
	" sample10_parquet_time.stdev,\n",
	" sample10_hive_time.stdev],\n",
	" 'best': [sample10_mem_time.best,\n",
	" sample10_parquet_time.best,\n",
	" sample10_hive_time.best],\n",
	" 'repeatable': [sample10_mem_repeatable,\n",
	" sample10_parquet_repeatable,\n",
	" sample10_hive_repeatable],\n",
	"}, index=['memory', 'parquet', 'hive'])"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
	import duckdb
	import numpy as np
	import pandas as pd
	from tempfile import TemporaryDirectory, NamedTemporaryFile

	conn = duckdb.connect(":memory:")
	conn.execute("SET threads = 1;")

	# generate data
	np.random.seed(42)
	size = 1_000_000
	df = pd.DataFrame(
	{
	"range": range(size),
	"bin": np.random.randint(0, 10, size=size),
	}
	)

	print("sampling from df")
	print(
	conn.sql("""
	SELECT range
	FROM df
	USING SAMPLE reservoir(10 ROWS)
	REPEATABLE(42);
	""").fetchall()
	)

	# write to single parquet
	parquet_file = NamedTemporaryFile(suffix=".parquet")
	conn.sql(f"""
	COPY (FROM df)
	TO '{parquet_file.name}'
	(FORMAT 'parquet', OVERWRITE);
	""")

	print("sampling from single parquet")
	print(
	conn.sql(f"""
	SELECT range
	FROM '{parquet_file.name}'
	USING SAMPLE reservoir(10 ROWS)
	REPEATABLE(42);
	""").fetchall()
	)

	parquet_file.close()

	# write a hive partitioned parquet files
	hive_path = TemporaryDirectory()
	conn.sql(f"""
	COPY (FROM df)
	TO '{hive_path.name}'
	(FORMAT 'parquet', PARTITION_BY bin, OVERWRITE);
	""")

	print("sampling from hive partitioned parquet files")
	print(
	conn.sql(f"""
	SELECT range
	FROM read_parquet('{hive_path.name}//.parquet', hive_partitioning = true)
	USING SAMPLE reservoir(10 ROWS)
	REPEATABLE(42);
	""").fetchall()
	)

	hive_path.cleanup()