From edf403ea82d7ade2f7045970b27bffcb9bd42fa5 Mon Sep 17 00:00:00 2001
From: Matej Mojzes <you@example.com>
Date: Tue, 26 Nov 2024 08:59:08 +0100
Subject: [PATCH] cleansed/updated version

---
 gcp_data_processing/pyspark_demo.ipynb | 54 +++++++++----------------
 1 file changed, 18 insertions(+), 36 deletions(-)

diff --git a/gcp_data_processing/pyspark_demo.ipynb b/gcp_data_processing/pyspark_demo.ipynb
index 41f61f2..43dc860 100644
--- a/gcp_data_processing/pyspark_demo.ipynb
+++ b/gcp_data_processing/pyspark_demo.ipynb
@@ -3,35 +3,19 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "c8ba9c17",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = [\n",
-    "    (1, \"Alice\", 30, \"HR\"),\n",
-    "    (2, \"Bob\", 25, \"Engineering\"),\n",
-    "    (3, \"Catherine\", 35, \"Finance\"),\n",
-    "    (4, \"David\", 29, \"Engineering\"),\n",
-    "    (5, \"Eve\", 40, \"HR\"),\n",
-    "]\n",
-    "columns = [\"ID\", \"Name\", \"Age\", \"Department\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "4362bfc7",
+   "id": "a8a16b7e",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "24/11/25 14:49:29 WARN Utils: Your hostname, Matejs-Lopata.local resolves to a loopback address: 127.0.0.1; using 10.16.6.34 instead (on interface en0)\n",
-      "24/11/25 14:49:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
+      "24/11/26 08:52:09 WARN Utils: Your hostname, Matejs-Lopata.local resolves to a loopback address: 127.0.0.1; using 192.168.1.113 instead (on interface en0)\n",
+      "24/11/26 08:52:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
       "Setting default log level to \"WARN\".\n",
       "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
-      "24/11/25 14:49:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
+      "24/11/26 08:52:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "[Stage 0:> (0 + 8) / 8]\r"
      ]
     },
     {
@@ -45,8 +29,6 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\r",
-      "[Stage 0:> (0 + 8) / 8]\r",
       "\r",
       " \r"
      ]
@@ -73,14 +55,14 @@
     "\n",
     "df = spark.createDataFrame(data, columns)\n",
     "\n",
-    "# Let's check if we have the correct number of rows (5)\n",
+    "# Let's check if we have the correct number of rows (5), this will trigger execution of the DAG\n",
     "print(df.count())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "8839b65c",
+   "execution_count": 2,
+   "id": "fabe6643",
    "metadata": {},
    "outputs": [
     {
@@ -103,7 +85,7 @@
     "grouped = df.groupBy(\"Department\")\n",
     "\n",
     "# Step 4: Calculate Average Age\n",
-    "result = grouped.avg(\"Age\").alias(\"Avg_Age\")\n",
+    "result = grouped.avg(\"Age\")#.alias(\"Avg_Age\")\n",
     "\n",
     "# Step 5: Rename the column and order by Avg_Age\n",
     "result = result.withColumnRenamed(\"avg(Age)\", \"Avg_Age\") \\\n",
@@ -115,8 +97,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "734cedc9",
+   "execution_count": 3,
+   "id": "14a1347b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -131,8 +113,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "f0133e23",
+   "execution_count": 4,
+   "id": "e43658aa",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -147,8 +129,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "6096a4a0",
+   "execution_count": 5,
+   "id": "0fbd823f",
    "metadata": {},
    "outputs": [
     {
@@ -185,12 +167,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "7a0ffcea",
+   "execution_count": 6,
+   "id": "b8d3cc8a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not really necessary in notebook, but let's stop the Spark session\n",
+    "# Not really necessary in a notebook, but let's stop the Spark session\n",
     "spark.stop()"
    ]
   }
-- 
GitLab
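For anyone who wants to run the flow this patch tidies up outside of Jupyter, the sketch below condenses the touched notebook cells into one standalone PySpark script. It is a minimal sketch, not part of the patch: the toy data and column names come from the removed first cell, the app name "pyspark_demo_sketch" is invented for illustration, and the orderBy direction is left at the default because the patch does not show that line.

from pyspark.sql import SparkSession

# Invented app name; any name works here.
spark = SparkSession.builder.appName("pyspark_demo_sketch").getOrCreate()

data = [
    (1, "Alice", 30, "HR"),
    (2, "Bob", 25, "Engineering"),
    (3, "Catherine", 35, "Finance"),
    (4, "David", 29, "Engineering"),
    (5, "Eve", 40, "HR"),
]
columns = ["ID", "Name", "Age", "Department"]

df = spark.createDataFrame(data, columns)

# count() is an action, so it triggers execution of the DAG built by the
# lazy transformations above (the point of the updated comment in the patch).
print(df.count())  # 5

# groupBy/avg are lazy; show() is the action that actually runs them.
result = (
    df.groupBy("Department")
      .avg("Age")
      .withColumnRenamed("avg(Age)", "Avg_Age")
      .orderBy("Avg_Age")
)
result.show()

spark.stop()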
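One change worth a closer look is step 4, where .alias("Avg_Age") is commented out: DataFrame.alias() names the whole DataFrame (useful for self-joins) rather than the aggregated column, which is presumably why the rename still happens via withColumnRenamed in step 5. An alternative, not what the notebook does, is to aggregate with agg() and a column-level alias so that no rename step is needed. A small sketch, reusing df from the script above:

from pyspark.sql import functions as F

# A column-level alias inside agg() names the result column directly,
# so the separate withColumnRenamed step becomes unnecessary.
result = (
    df.groupBy("Department")
      .agg(F.avg("Age").alias("Avg_Age"))
      .orderBy(F.desc("Avg_Age"))
)
result.show()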