{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{"id":"QMRYNYL7eTsD"},"source":["# Medical Insurance Cost prediction\n","\n","This exercise is about performing some of the steps described in the notebook for the California Housing Data on another dataset for Medical Insurance Cost prediction."]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"LAHyKeqahC0j"},"source":["# Get the Data"]},{"cell_type":"code","execution_count":572,"metadata":{"executionInfo":{"elapsed":398,"status":"ok","timestamp":1682900855852,"user":{"displayName":"Alex Thomo","userId":"08504196803322236588"},"user_tz":420},"id":"YDh9HJxCZIlZ"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","\n","medical = pd.read_csv(\"https://bit.ly/44evDuW\")"]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"lV47LyGfhPwR"},"source":["# Take a Quick Look at the Data Structure"]},{"cell_type":"code","execution_count":573,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":27,"status":"ok","timestamp":1682900855853,"user":{"displayName":"Alex Thomo","userId":"08504196803322236588"},"user_tz":420},"id":"3JL7eYP3ZfsX","outputId":"ce8eb8cb-0cb9-4216-bf31-0fe43c912934"},"outputs":[{"data":{"text/html":["
\n"," | age | \n","sex | \n","bmi | \n","children | \n","smoker | \n","region | \n","charges | \n","
---|---|---|---|---|---|---|---|
0 | \n","19 | \n","female | \n","27.900 | \n","0 | \n","yes | \n","southwest | \n","16884.92400 | \n","
1 | \n","18 | \n","male | \n","33.770 | \n","1 | \n","no | \n","southeast | \n","1725.55230 | \n","
2 | \n","28 | \n","male | \n","33.000 | \n","3 | \n","no | \n","southeast | \n","4449.46200 | \n","
3 | \n","33 | \n","male | \n","22.705 | \n","0 | \n","no | \n","northwest | \n","21984.47061 | \n","
4 | \n","32 | \n","male | \n","28.880 | \n","0 | \n","no | \n","northwest | \n","3866.85520 | \n","
\n"," | age | \n","bmi | \n","children | \n","charges | \n","
---|---|---|---|---|
count | \n","1338.000000 | \n","1338.000000 | \n","1338.000000 | \n","1338.000000 | \n","
mean | \n","39.207025 | \n","30.663397 | \n","1.094918 | \n","13270.422265 | \n","
std | \n","14.049960 | \n","6.098187 | \n","1.205493 | \n","12110.011237 | \n","
min | \n","18.000000 | \n","15.960000 | \n","0.000000 | \n","1121.873900 | \n","
25% | \n","27.000000 | \n","26.296250 | \n","0.000000 | \n","4740.287150 | \n","
50% | \n","39.000000 | \n","30.400000 | \n","1.000000 | \n","9382.033000 | \n","
75% | \n","51.000000 | \n","34.693750 | \n","2.000000 | \n","16639.912515 | \n","
max | \n","64.000000 | \n","53.130000 | \n","5.000000 | \n","63770.428010 | \n","
\n"," | age | \n","bmi | \n","children | \n","charges | \n","
---|---|---|---|---|
age | \n","1.000000 | \n","0.118274 | \n","0.060999 | \n","0.281721 | \n","
bmi | \n","0.118274 | \n","1.000000 | \n","-0.005040 | \n","0.197316 | \n","
children | \n","0.060999 | \n","-0.005040 | \n","1.000000 | \n","0.071885 | \n","
charges | \n","0.281721 | \n","0.197316 | \n","0.071885 | \n","1.000000 | \n","
Pipeline(steps=[('columntransformer',\n"," ColumnTransformer(transformers=[('num',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='median')),\n"," ('standardscaler',\n"," StandardScaler())]),\n"," ['age', 'bmi', 'children']),\n"," ('cat',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='most_frequent')),\n"," ('onehotencoder',\n"," OneHotEncoder(handle_unknown='ignore'))]),\n"," ['sex', 'smoker',\n"," 'region'])])),\n"," ('linearregression', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer',\n"," ColumnTransformer(transformers=[('num',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='median')),\n"," ('standardscaler',\n"," StandardScaler())]),\n"," ['age', 'bmi', 'children']),\n"," ('cat',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='most_frequent')),\n"," ('onehotencoder',\n"," OneHotEncoder(handle_unknown='ignore'))]),\n"," ['sex', 'smoker',\n"," 'region'])])),\n"," ('linearregression', LinearRegression())])
ColumnTransformer(transformers=[('num',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='median')),\n"," ('standardscaler',\n"," StandardScaler())]),\n"," ['age', 'bmi', 'children']),\n"," ('cat',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='most_frequent')),\n"," ('onehotencoder',\n"," OneHotEncoder(handle_unknown='ignore'))]),\n"," ['sex', 'smoker', 'region'])])
['age', 'bmi', 'children']
SimpleImputer(strategy='median')
StandardScaler()
['sex', 'smoker', 'region']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
LinearRegression()
RandomizedSearchCV(cv=3,\n"," estimator=Pipeline(steps=[('preprocessing',\n"," ColumnTransformer(transformers=[('num',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='median')),\n"," ('standardscaler',\n"," StandardScaler())]),\n"," ['age',\n"," 'bmi',\n"," 'children']),\n"," ('cat',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='most_frequent')),\n"," ('onehotencoder',\n"," OneHotEncoder(handle_unknown='ignore'))]),\n"," ['sex',\n"," 'smoker',\n"," 'region'])])),\n"," ('random_forest',\n"," RandomForestRegressor(random_state=42))]),\n"," param_distributions={'random_forest__max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001C52DA6D040>},\n"," random_state=42, scoring='neg_root_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=3,\n"," estimator=Pipeline(steps=[('preprocessing',\n"," ColumnTransformer(transformers=[('num',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='median')),\n"," ('standardscaler',\n"," StandardScaler())]),\n"," ['age',\n"," 'bmi',\n"," 'children']),\n"," ('cat',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='most_frequent')),\n"," ('onehotencoder',\n"," OneHotEncoder(handle_unknown='ignore'))]),\n"," ['sex',\n"," 'smoker',\n"," 'region'])])),\n"," ('random_forest',\n"," RandomForestRegressor(random_state=42))]),\n"," param_distributions={'random_forest__max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001C52DA6D040>},\n"," random_state=42, scoring='neg_root_mean_squared_error')
Pipeline(steps=[('preprocessing',\n"," ColumnTransformer(transformers=[('num',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='median')),\n"," ('standardscaler',\n"," StandardScaler())]),\n"," ['age', 'bmi', 'children']),\n"," ('cat',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='most_frequent')),\n"," ('onehotencoder',\n"," OneHotEncoder(handle_unknown='ignore'))]),\n"," ['sex', 'smoker',\n"," 'region'])])),\n"," ('random_forest', RandomForestRegressor(random_state=42))])
ColumnTransformer(transformers=[('num',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='median')),\n"," ('standardscaler',\n"," StandardScaler())]),\n"," ['age', 'bmi', 'children']),\n"," ('cat',\n"," Pipeline(steps=[('simpleimputer',\n"," SimpleImputer(strategy='most_frequent')),\n"," ('onehotencoder',\n"," OneHotEncoder(handle_unknown='ignore'))]),\n"," ['sex', 'smoker', 'region'])])
['age', 'bmi', 'children']
SimpleImputer(strategy='median')
StandardScaler()
['sex', 'smoker', 'region']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
RandomForestRegressor(random_state=42)