From edf403ea82d7ade2f7045970b27bffcb9bd42fa5 Mon Sep 17 00:00:00 2001
From: Matej Mojzes <you@example.com>
Date: Tue, 26 Nov 2024 08:59:08 +0100
Subject: [PATCH] cleansed/updated version

---
 gcp_data_processing/pyspark_demo.ipynb | 54 +++++++++----------------
 1 file changed, 18 insertions(+), 36 deletions(-)

diff --git a/gcp_data_processing/pyspark_demo.ipynb b/gcp_data_processing/pyspark_demo.ipynb
index 41f61f2..43dc860 100644
--- a/gcp_data_processing/pyspark_demo.ipynb
+++ b/gcp_data_processing/pyspark_demo.ipynb
@@ -3,35 +3,19 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "c8ba9c17",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = [\n",
-    "    (1, \"Alice\", 30, \"HR\"),\n",
-    "    (2, \"Bob\", 25, \"Engineering\"),\n",
-    "    (3, \"Catherine\", 35, \"Finance\"),\n",
-    "    (4, \"David\", 29, \"Engineering\"),\n",
-    "    (5, \"Eve\", 40, \"HR\"),\n",
-    "]\n",
-    "columns = [\"ID\", \"Name\", \"Age\", \"Department\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "4362bfc7",
+   "id": "a8a16b7e",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "24/11/25 14:49:29 WARN Utils: Your hostname, Matejs-Lopata.local resolves to a loopback address: 127.0.0.1; using 10.16.6.34 instead (on interface en0)\n",
-      "24/11/25 14:49:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
+      "24/11/26 08:52:09 WARN Utils: Your hostname, Matejs-Lopata.local resolves to a loopback address: 127.0.0.1; using 192.168.1.113 instead (on interface en0)\n",
+      "24/11/26 08:52:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
       "Setting default log level to \"WARN\".\n",
       "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
-      "24/11/25 14:49:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
+      "24/11/26 08:52:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "[Stage 0:> (0 + 8) / 8]\r"
      ]
     },
     {
@@ -45,8 +29,6 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\r",
-      "[Stage 0:> (0 + 8) / 8]\r",
       "\r",
       " \r"
      ]
@@ -73,14 +55,14 @@
     "\n",
     "df = spark.createDataFrame(data, columns)\n",
     "\n",
-    "# Let's check if we have the correct number of rows (5)\n",
+    "# Let's check if we have the correct number of rows (5), this will trigger execution of the DAG\n",
     "print(df.count())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "8839b65c",
+   "execution_count": 2,
+   "id": "fabe6643",
    "metadata": {},
    "outputs": [
     {
@@ -103,7 +85,7 @@
     "grouped = df.groupBy(\"Department\")\n",
     "\n",
     "# Step 4: Calculate Average Age\n",
-    "result = grouped.avg(\"Age\").alias(\"Avg_Age\")\n",
+    "result = grouped.avg(\"Age\")#.alias(\"Avg_Age\")\n",
     "\n",
     "# Step 5: Rename the column and order by Avg_Age\n",
     "result = result.withColumnRenamed(\"avg(Age)\", \"Avg_Age\") \\\n",
@@ -115,8 +97,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "734cedc9",
+   "execution_count": 3,
+   "id": "14a1347b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -131,8 +113,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "f0133e23",
+   "execution_count": 4,
+   "id": "e43658aa",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -147,8 +129,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "6096a4a0",
+   "execution_count": 5,
+   "id": "0fbd823f",
    "metadata": {},
    "outputs": [
     {
@@ -185,12 +167,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "7a0ffcea",
+   "execution_count": 6,
+   "id": "b8d3cc8a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not really necessary in notebook, but let's stop the Spark session\n",
+    "# Not really necessary in a notebook, but let's stop the Spark session\n",
     "spark.stop()"
    ]
   }
-- 
GitLab
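For anyone who wants to run the flow this patch tidies up outside of Jupyter, the sketch below condenses the touched notebook cells into one standalone PySpark script. It is a minimal sketch, not part of the patch: the toy data and column names come from the removed first cell, the app name "pyspark_demo_sketch" is invented for illustration, and the orderBy direction is left at the default because the patch does not show that line.

from pyspark.sql import SparkSession

# Invented app name; any name works here.
spark = SparkSession.builder.appName("pyspark_demo_sketch").getOrCreate()

data = [
    (1, "Alice", 30, "HR"),
    (2, "Bob", 25, "Engineering"),
    (3, "Catherine", 35, "Finance"),
    (4, "David", 29, "Engineering"),
    (5, "Eve", 40, "HR"),
]
columns = ["ID", "Name", "Age", "Department"]

df = spark.createDataFrame(data, columns)

# count() is an action, so it triggers execution of the DAG built by the
# lazy transformations above (the point of the updated comment in the patch).
print(df.count())  # 5

# groupBy/avg are lazy; show() is the action that actually runs them.
result = (
    df.groupBy("Department")
      .avg("Age")
      .withColumnRenamed("avg(Age)", "Avg_Age")
      .orderBy("Avg_Age")
)
result.show()

spark.stop()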
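One change worth a closer look is step 4, where .alias("Avg_Age") is commented out: DataFrame.alias() names the whole DataFrame (useful for self-joins) rather than the aggregated column, which is presumably why the rename still happens via withColumnRenamed in step 5. An alternative, not what the notebook does, is to aggregate with agg() and a column-level alias so that no rename step is needed. A small sketch, reusing df from the script above:

from pyspark.sql import functions as F

# A column-level alias inside agg() names the result column directly,
# so the separate withColumnRenamed step becomes unnecessary.
result = (
    df.groupBy("Department")
      .agg(F.avg("Age").alias("Avg_Age"))
      .orderBy(F.desc("Avg_Age"))
)
result.show()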