import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
#get model duration
import time
from datetime import date
# Use the following setting to display the output of every statement in a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
insurance = pd.read_csv("insurance.csv")
age: age of the primary beneficiary
sex: gender of the insurance contractor: female, male
bmi: body mass index, an objective measure of body weight relative to height (kg/m^2); the ideal range is 18.5 to 24.9
children: number of children covered by the health insurance / number of dependents
smoker: whether the beneficiary smokes
region: the beneficiary's residential area in the US: northeast, southeast, southwest, northwest
charges: individual medical costs billed by the health insurance
insurance.shape
(1338, 7)
insurance.describe()
 | age | bmi | children | charges |
---|---|---|---|---|
count | 1338.000000 | 1338.000000 | 1338.000000 | 1338.000000 |
mean | 39.207025 | 30.663397 | 1.094918 | 13270.422265 |
std | 14.049960 | 6.098187 | 1.205493 | 12110.011237 |
min | 18.000000 | 15.960000 | 0.000000 | 1121.873900 |
25% | 27.000000 | 26.296250 | 0.000000 | 4740.287150 |
50% | 39.000000 | 30.400000 | 1.000000 | 9382.033000 |
75% | 51.000000 | 34.693750 | 2.000000 | 16639.912515 |
max | 64.000000 | 53.130000 | 5.000000 | 63770.428010 |
insurance.dtypes
age          int64
sex         object
bmi        float64
children     int64
smoker      object
region      object
charges    float64
dtype: object
insurance.head()
 | age | sex | bmi | children | smoker | region | charges |
---|---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
insurance.isnull().sum()
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
total_miss = insurance.isnull().any() # No missing values, it seems.
total_miss
age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool
insurance.corr()
 | age | bmi | children | charges |
---|---|---|---|---|
age | 1.000000 | 0.109272 | 0.042469 | 0.299008 |
bmi | 0.109272 | 1.000000 | 0.012759 | 0.198341 |
children | 0.042469 | 0.012759 | 1.000000 | 0.067998 |
charges | 0.299008 | 0.198341 | 0.067998 | 1.000000 |
sns.heatmap(insurance.corr(), annot=True)
from scipy import stats
from warnings import filterwarnings
filterwarnings('ignore')
g = sns.JointGrid(data=insurance, x='age', y='charges')
g = g.plot(sns.regplot, sns.histplot)
# JointGrid.annotate was removed from seaborn; annotate the joint axes manually.
r, p = stats.pearsonr(insurance['age'], insurance['charges'])
g.ax_joint.annotate(f'pearsonr = {r:.3f}; p = {p:.2e}', xy=(0.05, 0.95), xycoords='axes fraction')
plt.show()
# Note: pd.cut builds left-open intervals, so rows with children == 0 fall outside
# the first bin (0, 1] and get NaN (see the sketch after insurance.info() below).
bins = [0,1,3,5]
labels = ['Small_Family','Medium_Family','Large_Family']
insurance['children_binned'] = pd.cut(insurance['children'], bins=bins, labels=labels)
sns.boxplot(x = 'children_binned',y='charges', data = insurance)
insurance.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   age              1338 non-null   int64
 1   sex              1338 non-null   object
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64
 4   smoker           1338 non-null   object
 5   region           1338 non-null   object
 6   charges          1338 non-null   float64
 7   children_binned  764 non-null    category
dtypes: category(1), float64(2), int64(2), object(3)
memory usage: 74.7+ KB
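Only 764 of the 1338 rows got a bin: `pd.cut` uses left-open intervals, so the 574 rows with children == 0 fall outside (0, 1] and end up NaN. A minimal sketch of a variant that keeps those rows, assuming families with 0 children should also count as Small_Family (done on a copy so the pipeline below is unchanged):
insurance_sketch = insurance.copy()
bins_inclusive = [-1, 1, 3, 5] # shift the left edge below zero so 0-children rows land in the first bin
insurance_sketch['children_binned'] = pd.cut(insurance_sketch['children'], bins=bins_inclusive, labels=labels)
insurance_sketch['children_binned'].isnull().sum() # now 0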
for col in ['sex', 'smoker', 'region']:
print( col,':')
print(insurance[col].value_counts())
sex :
male      676
female    662
Name: sex, dtype: int64
smoker :
no     1064
yes     274
Name: smoker, dtype: int64
region :
southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64
for col in ['sex', 'smoker', 'region']:
if (insurance[col].dtype == 'object'):
le = preprocessing.LabelEncoder()
le = le.fit(insurance[col])
insurance[col] = le.transform(insurance[col])
print('Completed Label encoding on',col)
Completed Label encoding on sex
Completed Label encoding on smoker
Completed Label encoding on region
insurance.head()
 | age | sex | bmi | children | smoker | region | charges | children_binned |
---|---|---|---|---|---|---|---|---|
0 | 19 | 0 | 27.900 | 0 | 1 | 3 | 16884.92400 | NaN |
1 | 18 | 1 | 33.770 | 1 | 0 | 2 | 1725.55230 | Small_Family |
2 | 28 | 1 | 33.000 | 3 | 0 | 2 | 4449.46200 | Medium_Family |
3 | 33 | 1 | 22.705 | 0 | 0 | 1 | 21984.47061 | NaN |
4 | 32 | 1 | 28.880 | 0 | 0 | 1 | 3866.85520 | NaN |
The output above shows the re-labeling of the 'sex', 'smoker' and 'region' object columns, whose raw string values were replaced with integer category codes such as 0, 1, 2, 3.
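One caveat: label encoding imposes an artificial order on `region` (0 < 1 < 2 < 3) that a linear model can read as a trend. A minimal sketch of one-hot encoding as an alternative (not the approach used below); `insurance_onehot` is an illustrative name:
region_dummies = pd.get_dummies(insurance['region'], prefix='region')
insurance_onehot = pd.concat([insurance.drop(columns='region'), region_dummies], axis=1)
insurance_onehot.head()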
insurance.corr()
 | age | sex | bmi | children | smoker | region | charges |
---|---|---|---|---|---|---|---|
age | 1.000000 | -0.020856 | 0.109272 | 0.042469 | -0.025019 | 0.002127 | 0.299008 |
sex | -0.020856 | 1.000000 | 0.046371 | 0.017163 | 0.076185 | 0.004588 | 0.057292 |
bmi | 0.109272 | 0.046371 | 1.000000 | 0.012759 | 0.003750 | 0.157566 | 0.198341 |
children | 0.042469 | 0.017163 | 0.012759 | 1.000000 | 0.007673 | 0.016569 | 0.067998 |
smoker | -0.025019 | 0.076185 | 0.003750 | 0.007673 | 1.000000 | -0.002181 | 0.787251 |
region | 0.002127 | 0.004588 | 0.157566 | 0.016569 | -0.002181 | 1.000000 | -0.006208 |
charges | 0.299008 | 0.057292 | 0.198341 | 0.067998 | 0.787251 | -0.006208 | 1.000000 |
sns.heatmap(insurance.corr(), annot=True)
insurance.dtypes
age                   int64
sex                   int32
bmi                 float64
children              int64
smoker                int32
region                int32
charges             float64
children_binned    category
dtype: object
# distplot is deprecated in recent seaborn; use histplot with a KDE overlay instead
# (the original also assigned the axes to sns.plot, clobbering an attribute on the module).
sns.histplot(insurance['charges'], kde=True)
for col in ['sex', 'smoker', 'region']:
print( col,':')
print(insurance[col].value_counts())
sex :
1    676
0    662
Name: sex, dtype: int64
smoker :
0    1064
1     274
Name: smoker, dtype: int64
region :
2    364
3    325
1    325
0    324
Name: region, dtype: int64
The summary above shows the counts under the new codes. LabelEncoder assigns codes alphabetically, so the mapping is: sex: female = 0, male = 1; smoker: no = 0, yes = 1; region: northeast = 0, northwest = 1, southeast = 2, southwest = 3.
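If the original string labels are needed again later, note that the encoding loop above overwrites `le` on every iteration. A sketch that keeps one encoder per column, assuming the raw CSV is re-read into a hypothetical `insurance_raw`:
insurance_raw = pd.read_csv("insurance.csv")
encoders = {}
for col in ['sex', 'smoker', 'region']:
    encoders[col] = preprocessing.LabelEncoder().fit(insurance_raw[col])
    print(col, '->', dict(enumerate(encoders[col].classes_)))
# encoders['region'].inverse_transform([2]) recovers 'southeast'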
insurance.to_csv('insurance_encoded.csv', index = False)
insurance_input = insurance.drop(['charges', 'children_binned'],axis=1)
insurance_target = insurance['charges']
x_scaled = StandardScaler().fit_transform(insurance_input)
x_train, x_test, y_train, y_test = train_test_split(x_scaled,
insurance_target,
test_size = 0.25,
random_state=1211)
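Scaling before the split lets test-set statistics leak into the scaler. A minimal leak-free sketch with a `Pipeline`, so StandardScaler is fit on the training fold only (the names `x_tr`, `pipe`, etc. are illustrative):
from sklearn.pipeline import make_pipeline
x_tr, x_te, y_tr, y_te = train_test_split(insurance_input, insurance_target, test_size=0.25, random_state=1211)
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(x_tr, y_tr)
pipe.score(x_te, y_te) # R-squared on the held-out fold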
# Initialization (LinearRegression is already imported at the top)
linReg = LinearRegression()
start_time = time.time()
# Fit the linear model to the training set
linReg_model = linReg.fit(x_train, y_train)
today = date.today()
print("--- %s seconds ---" % (time.time() - start_time))
--- 0.348020076751709 seconds ---
linReg.coef_
array([3690.94980639, -154.24448937, 1926.27073861, 520.55396204, 9697.07408404, -410.85583285])
Throughout the scikit-learn linear-model module, the vector w = (w_1, ..., w_p) is designated `coef_` and w_0 is designated `intercept_`.
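As a sanity check, a prediction can be rebuilt by hand from these two attributes, since y_hat = X·w + w_0:
manual_pred = x_test @ linReg.coef_ + linReg.intercept_
np.allclose(manual_pred, linReg.predict(x_test)) # True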
input_columns = insurance.columns[:6] # Get the feature names (age through region)
coeff_df = pd.DataFrame(linReg.coef_, input_columns, columns=['Coefficient'])
coeff_df
 | Coefficient |
---|---|
age | 3690.949806 |
sex | -154.244489 |
bmi | 1926.270739 |
children | 520.553962 |
smoker | 9697.074084 |
region | -410.855833 |
`smoker`, `age` and `bmi` have the largest effect on insurance charges, which matches the real world: age, obesity and smoking habits significantly influence health risks and, therefore, drive higher insurance costs.
y_pred_train = linReg.predict(x_train) # Predict on train data.
y_pred_train[y_pred_train < 0] = y_pred_train.mean() # Crude floor: replace negative predicted charges with the mean prediction.
y_pred = linReg.predict(x_test) # Predict on test data.
y_pred[y_pred < 0] = y_pred.mean()
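Clamping negatives to the mean is a crude patch. Since charges are strictly positive and right-skewed, a common alternative (a sketch, not the approach used in this notebook) is to regress on log-charges so predictions stay positive by construction:
from sklearn.compose import TransformedTargetRegressor
log_reg = TransformedTargetRegressor(regressor=LinearRegression(), func=np.log, inverse_func=np.exp)
log_reg.fit(x_train, y_train)
(log_reg.predict(x_test) < 0).sum() # 0: exp() keeps every prediction positive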
diff = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
diff.head(5)
 | Actual | Predicted |
---|---|---|
926 | 2913.56900 | 675.576998 |
490 | 1748.77400 | 2813.561034 |
1245 | 5615.36900 | 4312.605591 |
854 | 24106.91255 | 34277.985644 |
1002 | 1972.95000 | 1575.497282 |
diff.plot(kind='bar',figsize=(8,8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()
diff1 = diff.head(10)
diff1
 | Actual | Predicted |
---|---|---|
926 | 2913.56900 | 675.576998 |
490 | 1748.77400 | 2813.561034 |
1245 | 5615.36900 | 4312.605591 |
854 | 24106.91255 | 34277.985644 |
1002 | 1972.95000 | 1575.497282 |
448 | 5910.94400 | 7289.580969 |
475 | 28868.66390 | 37298.040845 |
1059 | 4462.72180 | 7388.135839 |
683 | 9863.47180 | 9473.012595 |
1278 | 22462.04375 | 32398.763860 |
diff1.plot(kind='bar',figsize=(8,8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()
def calculate_accuracy(actual, predicted):
    SST = sum((actual - np.mean(actual))**2)        # Total sum of squares, around the mean of the actuals
    SSR = sum((predicted - np.mean(predicted))**2)  # Regression sum of squares
    SSE = sum((actual - predicted)**2)              # Error (residual) sum of squares
    RMSE = np.sqrt(SSE / len(predicted))            # Root mean squared error
    RSqr = 1 - (SSE / SST)                          # R-squared
    VIF = 1 / (1 - RSqr) if RSqr != 1 else float('inf')  # Variance inflation, 1 / (1 - R^2)
    return RMSE, RSqr, VIF
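A quick way to validate the hand-rolled RMSE and R-squared is to compare them with scikit-learn's built-ins:
from sklearn.metrics import mean_squared_error, r2_score
np.sqrt(mean_squared_error(y_test, y_pred)) # should match the RMSE from calculate_accuracy
r2_score(y_test, y_pred)                    # should match RSqr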
# Compute MAE, MSE and related metrics.
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# print the intercept and coefficients
print('Intercept: ',linReg.intercept_)
#print('r2 score: ',linReg.score(x_train, y_train))
#print('r2 score: ',linReg.score(x_test, y_test))
Mean Absolute Error: 4068.9236952525494
Mean Squared Error: 36334934.764302835
Root Mean Squared Error: 6027.846610880442
Intercept: 13207.388159129547
# calculate the accuracies
RMSE, RSqr, VIF = calculate_accuracy(y_train,y_pred_train)
print('Linear RMSE train = ',RMSE, 'R-Square train = ',RSqr, 'VIF train = ',VIF)
RMSE, RSqr, VIF = calculate_accuracy(y_test,y_pred)
print('Linear RMSE test = ',RMSE, 'R-Square test = ',RSqr, 'VIF test = ',VIF)
Linear RMSE train = 6200.042303094933 R-Square train = 0.7437907235802795 VIF train = 3.9030593036054095
Linear RMSE test = 6027.846610880442 R-Square test = 0.7328599025559049 VIF test = 3.743354178454142
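A single 75/25 split can be noisy; k-fold cross-validation gives a steadier read on the linear model. A minimal sketch:
from sklearn.model_selection import cross_val_score
cv_r2 = cross_val_score(LinearRegression(), x_scaled, insurance_target, cv=5, scoring='r2')
print('CV R-Square: %.3f +/- %.3f' % (cv_r2.mean(), cv_r2.std()))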
!pip install statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.1
# Backward elimination: fit OLS on a feature subset and inspect significance
import statsmodels.api as sm
a, b = insurance_input.shape
# Prepend a column of ones as the intercept (const) term
insurance_input = np.append(arr = np.ones((a, 1)).astype(int), values = insurance_input, axis = 1)
print (insurance_input.shape)
# Keep const, age, sex and children (columns 0, 1, 2, 4)
insurance_input_opt = insurance_input[:, [0, 1, 2, 4]]
# Ordinary least squares
regressorOLS = sm.OLS(endog = insurance_target, exog = insurance_input_opt).fit()
regressorOLS.summary()
(1338, 7)
Dep. Variable: | charges | R-squared: | 0.096 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.094 |
Method: | Least Squares | F-statistic: | 47.43 |
Date: | Wed, 17 Nov 2021 | Prob (F-statistic): | 4.00e-29 |
Time: | 03:41:02 | Log-Likelihood: | -14410. |
No. Observations: | 1338 | AIC: | 2.883e+04 |
Df Residuals: | 1334 | BIC: | 2.885e+04 |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
 | coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 1837.2804 | 1022.425 | 1.797 | 0.073 | -168.455 | 3843.016 |
x1 | 256.8610 | 22.458 | 11.437 | 0.000 | 212.804 | 300.918 |
x2 | 1515.1059 | 630.399 | 2.403 | 0.016 | 278.424 | 2751.788 |
x3 | 545.1609 | 261.732 | 2.083 | 0.037 | 31.709 | 1058.613 |
Omnibus: | 397.122 | Durbin-Watson: | 2.046 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 856.804 |
Skew: | 1.721 | Prob(JB): | 8.86e-187 |
Kurtosis: | 4.876 | Cond. No. | 139. |
# Repeat the elimination on the standardized features
a, b = x_scaled.shape
# Prepend the intercept column (note: re-running this cell appends another column each time)
x_scaled = np.append(arr = np.ones((a, 1)).astype(int), values = x_scaled, axis = 1)
print (x_scaled.shape)
# Keep const, age, sex and children (columns 0, 1, 2, 4)
x_scaled_opt = x_scaled[:, [0, 1, 2, 4]]
# Ordinary least squares
regressorOLS = sm.OLS(endog = insurance_target, exog = x_scaled_opt).fit()
regressorOLS.summary()
(1338, 7)
Dep. Variable: | charges | R-squared: | 0.096 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.094 |
Method: | Least Squares | F-statistic: | 47.43 |
Date: | Wed, 17 Nov 2021 | Prob (F-statistic): | 4.00e-29 |
Time: | 03:42:02 | Log-Likelihood: | -14410. |
No. Observations: | 1338 | AIC: | 2.883e+04 |
Df Residuals: | 1334 | BIC: | 2.885e+04 |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
 | coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 1.327e+04 | 315.062 | 42.120 | 0.000 | 1.27e+04 | 1.39e+04 |
x1 | 3607.5381 | 315.421 | 11.437 | 0.000 | 2988.764 | 4226.313 |
x2 | 757.5115 | 315.182 | 2.403 | 0.016 | 139.204 | 1375.819 |
x3 | 656.9418 | 315.398 | 2.083 | 0.037 | 38.211 | 1275.673 |
Omnibus: | 397.122 | Durbin-Watson: | 2.046 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 856.804 |
Skew: | 1.721 | Prob(JB): | 8.86e-187 |
Kurtosis: | 4.876 | Cond. No. | 1.05 |
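The hand-picked column subset above can be automated: refit OLS, drop the feature with the worst p-value, and repeat until everything left is significant. A sketch, assuming `x_scaled` still carries the constant column prepended above (the threshold alpha = 0.05 is an assumption):
def backward_eliminate(X, y, alpha=0.05):
    cols = list(range(X.shape[1]))
    while len(cols) > 1:
        pvals = np.asarray(sm.OLS(endog=y, exog=X[:, cols]).fit().pvalues)
        worst = int(pvals.argmax())
        if pvals[worst] <= alpha: # everything left is significant: stop
            break
        del cols[worst] # drop the least significant feature
    return cols

kept = backward_eliminate(x_scaled, insurance_target)
print('kept columns:', kept)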
from sklearn.tree import DecisionTreeRegressor
dtc = DecisionTreeRegressor(random_state=1)
# fit the model on the training data
dtc.fit(x_train,y_train)
# prediction on train data
dtc_predict_train = dtc.predict(x_train)
# prediction on test data
dtc_predict_test = dtc.predict(x_test)
# calculate the accuracies
RMSE, RSqr, VIF = calculate_accuracy(y_train,dtc_predict_train)
print('Decision Tree RMSE train = ',RMSE, 'R-Square train = ',RSqr, 'VIF train = ',VIF)
RMSE, RSqr, VIF = calculate_accuracy(y_test,dtc_predict_test)
print('Decision Tree RMSE test = ',RMSE, 'R-Square test = ',RSqr, 'VIF test = ',VIF)
Decision Tree RMSE train = 279.1173376029252 R-Square train = 0.9994805465077318 VIF train = 1925.1001579244978
Decision Tree RMSE test = 6479.645615986229 R-Square test = 0.6912970796606894 VIF test = 3.239360349752606
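A train R-Square near 1.0 against 0.69 on test is a textbook overfit; limiting tree depth usually helps. A sketch using the GridSearchCV already imported at the top (the depth grid is an assumption):
params = {'max_depth': [2, 3, 4, 5, 6, 8]}
grid = GridSearchCV(DecisionTreeRegressor(random_state=1), params, cv=5, scoring='r2')
grid.fit(x_train, y_train)
print(grid.best_params_, grid.best_score_)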
from sklearn.ensemble import RandomForestRegressor
# Random forest model
rfc = RandomForestRegressor()
rfc.fit(x_train,y_train)
# prediction on train data
rfc_predict_train = rfc.predict(x_train)
# prediction on test data
rfc_predict_test = rfc.predict(x_test)
# calculate the accuracies
RMSE, RSqr, VIF = calculate_accuracy(y_train,rfc_predict_train)
print('Random Forest RMSE train = ',RMSE, 'R-Square train = ',RSqr, 'VIF train = ',VIF)
RMSE, RSqr, VIF = calculate_accuracy(y_test,rfc_predict_test)
print('Random Forest RMSE test = ',RMSE, 'R-Square test = ',RSqr, 'VIF test = ',VIF)
Random Forest RMSE train = 1843.180644813415 R-Square train = 0.9773495387681166 VIF train = 44.14921134552318
Random Forest RMSE test = 4805.639146875192 R-Square test = 0.8302233541387685 VIF test = 5.890091625542885
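The fitted forest also exposes per-feature importances, which can be set against the linear coefficients above:
importances = pd.Series(rfc.feature_importances_, index=input_columns)
print(importances.sort_values(ascending=False))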
print('Metrics of linear regression:')
RMSE, RSqr, VIF = calculate_accuracy(y_train,y_pred_train)
print('Linear RMSE train = ',RMSE, 'R-Square train = ',RSqr, 'VIF train = ',VIF)
RMSE, RSqr, VIF = calculate_accuracy(y_test,y_pred)
print('Linear RMSE test = ',RMSE, 'R-Square test = ',RSqr, 'VIF test = ',VIF)
print(' ')
print('Metrics of Decision Tree:')
RMSE, RSqr, VIF = calculate_accuracy(y_train,dtc_predict_train)
print('Decision Tree RMSE train = ',RMSE, 'R-Square train = ',RSqr, 'VIF train = ',VIF)
RMSE, RSqr, VIF = calculate_accuracy(y_test,dtc_predict_test)
print('Decision Tree RMSE test = ',RMSE, 'R-Square test = ',RSqr, 'VIF test = ',VIF)
print(' ')
print('Metrics of Random Forest:')
RMSE, RSqr, VIF = calculate_accuracy(y_train,rfc_predict_train)
print('Random Forest RMSE train = ',RMSE, 'R-Square train = ',RSqr, 'VIF train = ',VIF)
RMSE, RSqr, VIF = calculate_accuracy(y_test,rfc_predict_test)
print('Random Forest RMSE test = ',RMSE, 'R-Square test = ',RSqr, 'VIF test = ',VIF)
Metrics of linear regression:
Linear RMSE train = 6200.042303094933 R-Square train = 0.7437907235802795 VIF train = 3.9030593036054095
Linear RMSE test = 6027.846610880442 R-Square test = 0.7328599025559049 VIF test = 3.743354178454142

Metrics of Decision Tree:
Decision Tree RMSE train = 279.1173376029252 R-Square train = 0.9994805465077318 VIF train = 1925.1001579244978
Decision Tree RMSE test = 6479.645615986229 R-Square test = 0.6912970796606894 VIF test = 3.239360349752606

Metrics of Random Forest:
Random Forest RMSE train = 1843.180644813415 R-Square train = 0.9773495387681166 VIF train = 44.14921134552318
Random Forest RMSE test = 4805.639146875192 R-Square test = 0.8302233541387685 VIF test = 5.890091625542885
import pickle
filename = 'streamlit_insurance_predictcharges.pkl'
pickle.dump(rfc, open(filename, 'wb'))
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(x_test, y_test)
print(result)
0.8301987341704054
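For estimators that wrap large numpy arrays, like this forest, joblib is the serializer commonly recommended in the scikit-learn docs. A minimal sketch (the .joblib filename is an assumption):
import joblib
joblib.dump(rfc, 'streamlit_insurance_predictcharges.joblib')
rfc_loaded = joblib.load('streamlit_insurance_predictcharges.joblib')
print(rfc_loaded.score(x_test, y_test))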