alnoda-workspaces/workspaces/notebook-workspace/tutorials/sql-pandas.ipynb
2022-06-20 18:24:28 +00:00

458 lines
12 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"source": [
"# Fugue demo\n",
"\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Enable Fugue SQL by loading \"fugue_notebook\" extension\n",
"%load_ext fugue_notebook"
]
},
{
"source": [
"__NOTE__ When using Fugue SQL you might see warnings. You can safely ignore them, or run the next cell to suppress them."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Suppress warnings\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"source": [
"### Load CSV file into pandas dataframe"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 4,
"id": "18e8d3df",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('housing.csv') "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "17587d1b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>crim</th>\n",
" <th>zn</th>\n",
" <th>indus</th>\n",
" <th>chas</th>\n",
" <th>nox</th>\n",
" <th>rm</th>\n",
" <th>age</th>\n",
" <th>dis</th>\n",
" <th>rad</th>\n",
" <th>tax</th>\n",
" <th>ptratio</th>\n",
" <th>b</th>\n",
" <th>lstat</th>\n",
" <th>medv</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.00632</td>\n",
" <td>18.0</td>\n",
" <td>2.31</td>\n",
" <td>0</td>\n",
" <td>0.538</td>\n",
" <td>6.575</td>\n",
" <td>65.2</td>\n",
" <td>4.0900</td>\n",
" <td>1</td>\n",
" <td>296</td>\n",
" <td>15.3</td>\n",
" <td>396.90</td>\n",
" <td>4.98</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.02731</td>\n",
" <td>0.0</td>\n",
" <td>7.07</td>\n",
" <td>0</td>\n",
" <td>0.469</td>\n",
" <td>6.421</td>\n",
" <td>78.9</td>\n",
" <td>4.9671</td>\n",
" <td>2</td>\n",
" <td>242</td>\n",
" <td>17.8</td>\n",
" <td>396.90</td>\n",
" <td>9.14</td>\n",
" <td>21.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.02729</td>\n",
" <td>0.0</td>\n",
" <td>7.07</td>\n",
" <td>0</td>\n",
" <td>0.469</td>\n",
" <td>7.185</td>\n",
" <td>61.1</td>\n",
" <td>4.9671</td>\n",
" <td>2</td>\n",
" <td>242</td>\n",
" <td>17.8</td>\n",
" <td>392.83</td>\n",
" <td>4.03</td>\n",
" <td>34.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.03237</td>\n",
" <td>0.0</td>\n",
" <td>2.18</td>\n",
" <td>0</td>\n",
" <td>0.458</td>\n",
" <td>6.998</td>\n",
" <td>45.8</td>\n",
" <td>6.0622</td>\n",
" <td>3</td>\n",
" <td>222</td>\n",
" <td>18.7</td>\n",
" <td>394.63</td>\n",
" <td>2.94</td>\n",
" <td>33.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.06905</td>\n",
" <td>0.0</td>\n",
" <td>2.18</td>\n",
" <td>0</td>\n",
" <td>0.458</td>\n",
" <td>7.147</td>\n",
" <td>54.2</td>\n",
" <td>6.0622</td>\n",
" <td>3</td>\n",
" <td>222</td>\n",
" <td>18.7</td>\n",
" <td>396.90</td>\n",
" <td>5.33</td>\n",
" <td>36.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>501</th>\n",
" <td>0.06263</td>\n",
" <td>0.0</td>\n",
" <td>11.93</td>\n",
" <td>0</td>\n",
" <td>0.573</td>\n",
" <td>6.593</td>\n",
" <td>69.1</td>\n",
" <td>2.4786</td>\n",
" <td>1</td>\n",
" <td>273</td>\n",
" <td>21.0</td>\n",
" <td>391.99</td>\n",
" <td>9.67</td>\n",
" <td>22.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>502</th>\n",
" <td>0.04527</td>\n",
" <td>0.0</td>\n",
" <td>11.93</td>\n",
" <td>0</td>\n",
" <td>0.573</td>\n",
" <td>6.120</td>\n",
" <td>76.7</td>\n",
" <td>2.2875</td>\n",
" <td>1</td>\n",
" <td>273</td>\n",
" <td>21.0</td>\n",
" <td>396.90</td>\n",
" <td>9.08</td>\n",
" <td>20.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>503</th>\n",
" <td>0.06076</td>\n",
" <td>0.0</td>\n",
" <td>11.93</td>\n",
" <td>0</td>\n",
" <td>0.573</td>\n",
" <td>6.976</td>\n",
" <td>91.0</td>\n",
" <td>2.1675</td>\n",
" <td>1</td>\n",
" <td>273</td>\n",
" <td>21.0</td>\n",
" <td>396.90</td>\n",
" <td>5.64</td>\n",
" <td>23.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>504</th>\n",
" <td>0.10959</td>\n",
" <td>0.0</td>\n",
" <td>11.93</td>\n",
" <td>0</td>\n",
" <td>0.573</td>\n",
" <td>6.794</td>\n",
" <td>89.3</td>\n",
" <td>2.3889</td>\n",
" <td>1</td>\n",
" <td>273</td>\n",
" <td>21.0</td>\n",
" <td>393.45</td>\n",
" <td>6.48</td>\n",
" <td>22.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>505</th>\n",
" <td>0.04741</td>\n",
" <td>0.0</td>\n",
" <td>11.93</td>\n",
" <td>0</td>\n",
" <td>0.573</td>\n",
" <td>6.030</td>\n",
" <td>80.8</td>\n",
" <td>2.5050</td>\n",
" <td>1</td>\n",
" <td>273</td>\n",
" <td>21.0</td>\n",
" <td>396.90</td>\n",
" <td>7.88</td>\n",
" <td>11.9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>506 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" crim zn indus chas nox rm age dis rad tax \\\n",
"0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 \n",
"1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 \n",
"2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 \n",
"3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 \n",
"4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 \n",
".. ... ... ... ... ... ... ... ... ... ... \n",
"501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273 \n",
"502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273 \n",
"503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273 \n",
"504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273 \n",
"505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273 \n",
"\n",
" ptratio b lstat medv \n",
"0 15.3 396.90 4.98 24.0 \n",
"1 17.8 396.90 9.14 21.6 \n",
"2 17.8 392.83 4.03 34.7 \n",
"3 18.7 394.63 2.94 33.4 \n",
"4 18.7 396.90 5.33 36.2 \n",
".. ... ... ... ... \n",
"501 21.0 391.99 9.67 22.4 \n",
"502 21.0 396.90 9.08 20.6 \n",
"503 21.0 396.90 5.64 23.9 \n",
"504 21.0 393.45 6.48 22.0 \n",
"505 21.0 396.90 7.88 11.9 \n",
"\n",
"[506 rows x 14 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display pandas dataframe (standard visualization)\n",
"df"
]
},
{
"source": [
"### Query pandas dataframe with Fugue SQL\n",
"Query and display the output"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c6c92205",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "da4dd72b33dd4879b8ad866b208d88fa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%fsql\n",
"df2 = SELECT * FROM df LIMIT 2\n",
"PRINT"
]
},
{
"source": [
"Complex query with self-join"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "UsageError: Cell magic `%%fsql` not found.\n"
}
],
"source": [
"%%fsql\n",
"SELECT df.crim, df.zn, df.ptratio, df2.nox, df2.age, df2.tax, df2.medv \n",
"FROM df \n",
"JOIN df AS df2 ON df.age = df2.age\n",
"LIMIT 15\n",
"PRINT"
]
},
{
"source": [
"Query and save to Fugue frame"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%fsql\n",
"SELECT df.crim, df.zn, df.ptratio, df2.nox, df2.age, df2.tax, df2.medv \n",
"FROM df \n",
"JOIN df AS df2 ON df.age = df2.age\n",
"LIMIT 15\n",
"YIELD DATAFRAME AS fugue_df"
]
},
{
"source": [
"Transform the Fugue frame into a pandas dataframe"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ndf = fugue_df.as_pandas()\n",
"ndf"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 5
}