diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb new file mode 100644 index 0000000..2618d38 --- /dev/null +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -0,0 +1,1449 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "b359f50c", + "metadata": {}, + "outputs": [], + "source": [ + 
# %% Load the three raw customer extracts.
# The paths are relative, so they are resolved against the working directory.
(
    customer_data_insurance_df,
    customer_data_insurance2_df,
    customer_data_insurance3_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in ("Customer_data.csv", "Customer_data2.csv", "Customer_data3.csv")
)

# %% Column names, dtypes and non-null counts of the first extract.
customer_data_insurance_df.info()

# %% Preview each extract. The first two label their columns `ST` / `GENDER`,
# the third uses `State` / `Gender`, which is why the combined frame needs
# those columns merged afterwards.
customer_data_insurance_df.head()

# %%
customer_data_insurance2_df.head()

# %%
customer_data_insurance3_df.head()

# %% Stack the three extracts row-wise under a fresh 0..n-1 index.
all_customer_data_df = pd.concat(
    [
        customer_data_insurance_df,
        customer_data_insurance2_df,
        customer_data_insurance3_df,
    ],
    axis=0,
    ignore_index=True,
)
# %% Structure of the combined frame before the duplicate columns are merged.
all_customer_data_df.info()

# %% `ST` and `State` hold the same attribute under two names: take `ST` and
# fall back to `State` where `ST` is missing.
all_customer_data_df["state"] = all_customer_data_df["ST"].fillna(all_customer_data_df["State"])

# %%
all_customer_data_df = all_customer_data_df.drop(["ST", "State"], axis=1)

# %%
all_customer_data_df.info()

# %% Same merge for `GENDER` / `Gender`.
all_customer_data_df["gender"] = all_customer_data_df["GENDER"].fillna(all_customer_data_df["Gender"])
all_customer_data_df = all_customer_data_df.drop(["GENDER", "Gender"], axis=1)

# %%
all_customer_data_df.info()

# %%
def cle_for(df, state_conversion, gender_conversion):
    """Return a cleaned copy of ``df`` with normalised column names and categories.

    The input frame is left untouched: column names are normalised on a renamed
    copy instead of by overwriting ``df.columns``, so re-running the calling
    cell always starts from the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names, once lower-cased with spaces replaced by
        underscores, include ``state`` and ``gender``.
    state_conversion : dict
        Maps raw state labels to their canonical spelling.
    gender_conversion : dict
        Maps raw gender labels to their canonical code.

    Returns
    -------
    pandas.DataFrame
        New frame with snake_case column names, rows that are entirely NaN
        removed, and ``state`` / ``gender`` recoded. Labels missing from a
        conversion dict are kept as they are.
    """
    cleaned = (
        df.rename(columns=lambda name: name.lower().replace(" ", "_"))
        .dropna(axis=0, how="all")
        # Explicit copy so the assignments below never write into a view.
        .copy()
    )

    # `.map` yields NaN for labels absent from the dict; `.fillna` restores the
    # original label in that case so unknown values pass through unchanged.
    cleaned["state"] = cleaned["state"].map(state_conversion).fillna(cleaned["state"])
    cleaned["gender"] = cleaned["gender"].map(gender_conversion).fillna(cleaned["gender"])
    return cleaned

# %%
state_conversion = {
    'Cali': 'California', 'AZ': 'Arizona', 'WA': 'Washington',
    'Washington': 'Washington', 'Arizona': 'Arizona', 'Nevada': 'Nevada', 'California': 'California', 'Oregon': 'Oregon'
}

# %%
gender_conversion = {
    "Male": "M", "Female": "F", "Femal": "F", "female": "F"
}

# %% Clean the combined frame, then renumber the rows that survive the
# all-NaN row removal.
cleaned_customer_data_insurance_df = cle_for(all_customer_data_df, state_conversion, gender_conversion)
cleaned_customer_data_insurance_df = cleaned_customer_data_insurance_df.reset_index(drop=True)

# %%
cleaned_customer_data_insurance_df.columns

# %%
cleaned_customer_data_insurance_df["education"].unique()

# %% "Bachelors" is a spelling variant of "Bachelor".
cleaned_customer_data_insurance_df["education"] = cleaned_customer_data_insurance_df["education"].replace({
    "Bachelors": "Bachelor"
})
# %%
cleaned_customer_data_insurance_df["education"].unique()

# %% The column mixes '%'-suffixed strings with plain floats.
cleaned_customer_data_insurance_df["customer_lifetime_value"].unique()

# %% Convert customer_lifetime_value to float.
# The column is object dtype holding both strings ("697953.59%") and floats.
# The `.str` accessor returns NaN for every non-string element, so calling
# `.str.replace` directly on the column discarded all of the float values and
# they were then overwritten by the median. Casting to `str` first keeps them.
cleaned_customer_data_insurance_df["customer_lifetime_value"] = pd.to_numeric(
    cleaned_customer_data_insurance_df["customer_lifetime_value"]
    .astype(str)
    .str.replace('%', '', regex=False),
    # Missing entries (and anything else unparseable) become NaN here.
    errors='coerce',
)
# NOTE(review): the '%'-suffixed values are roughly 100x larger than the plain
# floats shown in the third file's preview; they may need dividing by 100 to
# share a scale. Confirm against the data dictionary.
median_value = cleaned_customer_data_insurance_df["customer_lifetime_value"].median()
# Only genuinely missing values are imputed with the median.
cleaned_customer_data_insurance_df["customer_lifetime_value"] = cleaned_customer_data_insurance_df["customer_lifetime_value"].fillna(median_value)
# %%
cleaned_customer_data_insurance_df["customer_lifetime_value"].unique()

# %% The column mixes date-like strings ('1/0/00') with plain integers.
cleaned_customer_data_insurance_df["number_of_open_complaints"].unique()

# %%
cleaned_customer_data_insurance_df["number_of_open_complaints"].dtypes

# %%
cleaned_customer_data_insurance_df["number_of_open_complaints"] = cleaned_customer_data_insurance_df["number_of_open_complaints"].astype(str)

# %%
cleaned_customer_data_insurance_df["number_of_open_complaints"].dtypes

# %%
def convert_values(x):
    """Parse one raw ``number_of_open_complaints`` entry into an integer.

    Parameters
    ----------
    x : object
        Either a slash-separated string such as ``'1/2/00'`` (the count is the
        second field), or a number / numeric string.

    Returns
    -------
    int or pandas.NA
        The parsed count, or ``pd.NA`` when ``x`` is missing or cannot be
        parsed. Malformed slash strings (e.g. ``'a/b'``) and values ``int()``
        rejects with ``TypeError`` now also yield ``pd.NA`` instead of raising.
    """
    if pd.isnull(x):
        return pd.NA  # Handle missing values
    try:
        if isinstance(x, str) and "/" in x:
            # Second field of the slash-separated form carries the count.
            return int(x.split("/")[1])
        return int(x)  # Directly convert numeric strings or numbers
    except (ValueError, TypeError):
        return pd.NA  # Any value that cannot be parsed

# Nullable Int64 keeps integer dtype even if some entries could not be parsed.
cleaned_customer_data_insurance_df["number_of_open_complaints"] = (
    cleaned_customer_data_insurance_df["number_of_open_complaints"]
    .apply(convert_values)
    .astype(pd.Int64Dtype())
)

# %%
cleaned_customer_data_insurance_df["number_of_open_complaints"].unique()

# %%
cleaned_customer_data_insurance_df.columns

# %%
cleaned_customer_data_insurance_df["policy_type"].unique()
# %% Vehicle classes before grouping.
cleaned_customer_data_insurance_df["vehicle_class"].unique()

# %% Fold the three premium classes into a single "Luxury" label, then show
# the labels that remain.
cleaned_customer_data_insurance_df["vehicle_class"] = (
    cleaned_customer_data_insurance_df["vehicle_class"]
    .replace(dict.fromkeys(("Sports Car", "Luxury SUV", "Luxury Car"), "Luxury"))
)
cleaned_customer_data_insurance_df["vehicle_class"].unique()

# %% Sanity check of the recoded categorical columns.
cleaned_customer_data_insurance_df["gender"].unique()

# %%
cleaned_customer_data_insurance_df["state"].unique()
# %% Final structure of the cleaned, combined customer frame.
cleaned_customer_data_insurance_df.info()

# %% [markdown]
# # Challenge 2: Structuring Data

# %% [markdown]
# In this challenge, we will continue to work with customer data from an insurance company, but we will use a dataset with more columns, called marketing_customer_analysis_clean.csv, which can be found at the following link:
#
# https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv
#
# This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring.

# %% Read the marketing dataset from a path relative to the working directory.
marketing_customer_analysis_df = pd.read_csv(
    filepath_or_buffer="marketing_customer_analysis_clean.csv"
)

# %% Number of distinct values per column.
marketing_customer_analysis_df.nunique()

# %% [markdown]
# 1. You work at the marketing department and you want to know which sales channel brought the most sales in terms of total revenue. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail).
# Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights.
# %% Sum of monthly premiums per sales channel, rounded to 2 decimals.
# NOTE(review): "revenue" is taken here to be `monthly_premium_auto`; confirm
# that this is the intended revenue measure for the exercise.
premium_pivot_table = (
    marketing_customer_analysis_df
    .pivot_table(
        index="sales_channel",
        values="monthly_premium_auto",
        aggfunc="sum",
    )
    .round(2)
)

print(premium_pivot_table)

# %%
# Agent brings in the largest premium total (386,335), followed by Branch
# (280,953), Call Center (197,970) and Web (151,511).
# The data might hint at market preferences that skew toward personal interaction.
# This could be critical information when planning marketing campaigns or customer interaction strategies.

# %% [markdown]
# 2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights.
# %% Mean customer lifetime value for every gender x education combination,
# rounded to 2 decimals.
pivot_table_clv = marketing_customer_analysis_df.pivot_table(
    values='customer_lifetime_value',
    index='gender',
    columns='education',
    aggfunc='mean'
)

pivot_table_clv = pivot_table_clv.round(2)

print(pivot_table_clv)

# %%
# Given the higher CLVs among females with "High School or Below" and males with "Master," these groups could be prime targets for retention strategies and upselling higher insurance packages or additional service offerings.

# %% [markdown]
# ## Bonus
#
# You work at the customer service department and you want to know which months had the highest number of complaints by policy type category. Create a summary table showing the number of complaints by policy type and month.
# Show it in a long format table.

# %% [markdown]
# *In data analysis, a long format table is a way of structuring data in which each observation or measurement is stored in a separate row of the table. The key characteristic of a long format table is that each column represents a single variable, and each row represents a single observation of that variable.*
#
# *More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*

# %% Total open complaints per (policy_type, month), in long format: one row
# per combination.
# `.size()` counted customer rows in each group rather than complaints (its
# totals added up to the number of rows in the dataset), so the
# `number_of_open_complaints` column is summed instead.
df_complaints = (
    marketing_customer_analysis_df
    .groupby(['policy_type', 'month'])['number_of_open_complaints']
    .sum()
    .reset_index(name='number_of_complaints')
)

print(df_complaints)