In this notebook we use a dataset that contains various information about Ford car models and their prices. We explore and clean the dataset, then use the scikit-learn library to split the data into training and test sets, and finally fit two regression models (linear regression and random forest regression) to predict car prices.
You can find my Medium article, in which I explain my process step by step, here:
https://medium.com/@ritaaggelou/train-test-split-in-python-a-step-by-step-guide-with-example-for-accurate-model-evaluation-53741204ff7d
# import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import linear_model
from sklearn.model_selection import train_test_split
# Load the Ford listings from the Kaggle input directory.
DATA_PATH = '/kaggle/input/ford-car-price-prediction/ford.csv'
df = pd.read_csv(DATA_PATH)

# First look at the data. The bare expressions below only render output when
# they are the last statement of a notebook cell.
df.head()
df.info()

# Count missing values per column.
df.isnull().sum()
df

# Count rows whose price is negative.
df['price'].lt(0).sum()
# Move the target column ('price') to the end of the frame.
# The columns are selected by name instead of a hard-coded positional list,
# so this step keeps working if the CSV's column order or count changes.
# NOTE(review): the original moved positional column 2 to the end, which is
# assumed to be 'price' — confirm against the CSV header.
feature_cols = [col for col in df.columns if col != 'price']
df = df[feature_cols + ['price']]
df

# Count fully duplicated rows.
df.duplicated().sum()

# Drop the duplicated rows and renumber the index from 0 so that it stays
# contiguous after the removal.
df = df.drop_duplicates().reset_index(drop=True)
df
# Print the distinct labels of every categorical column so that misspelled or
# inconsistent category names can be spotted by eye.
# The column list has its own name so the builtin `list` is not shadowed.
categorical_cols = ['model', 'transmission', 'fuelType']
for col in categorical_cols:
    print("Unique values for", col, ":", df[col].unique())
# Target series: the price column.
y = df['price']
# Feature matrix: every remaining column (drop returns a new frame, df is
# left untouched).
X = df.drop(columns=['price'])
X
# Encode the categorical feature columns as integer codes.
# A separate LabelEncoder is fitted for each column and stored in `encoders`
# (keyed by column name). Reusing a single encoder would overwrite its learned
# classes on every fit, leaving no way to translate a raw category such as a
# model name into its code (or back) later on.
# `le` is still bound to the last fitted encoder afterwards, as before.
encoders = {}
categorical_cols = ['model', 'transmission', 'fuelType']
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le
    # Show the original labels next to the codes that replaced them.
    print("Unique categorical values for", col, ":", df[col].unique())
    print("Unique numerical values for", col, ":", X[col].unique())
    print("")
X
# Hold out 20% of the rows for testing; the shuffle is seeded so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Report the shapes of the training pair first, then the test pair.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)
# Instantiate the linear regression model and fit it on the training data.
lin_model = linear_model.LinearRegression()
lin_model.fit(X_train, y_train)

# Predictions for the held-out test rows.
y_pred_l = lin_model.predict(X_test)

# Predict the price of a single car.
# The sample is wrapped in a DataFrame that carries the training column names:
# the model was fitted on a DataFrame, and passing a bare nested list makes
# scikit-learn warn that the input has no valid feature names.
# The values must follow the column order of X_train, and the categorical
# entries are already-encoded integer codes, not raw labels.
sample_car = pd.DataFrame(
    [[5, 2017, 0, 15000, 3, 150, 57, 1.5]],
    columns=X_train.columns,
)
predictedprice = lin_model.predict(sample_car)
print(predictedprice.round(2))

# Print the model equation: the intercept, then the feature names alongside
# their fitted coefficients.
print("Intercept: ", lin_model.intercept_)
print("")
print("Coefficients:")
print(X.columns, lin_model.coef_)
# Predicted vs. Actual Plot
# Each plot gets its own figure. Without an explicit plt.figure(), both
# scatters land on the same implicit axes when run as one script/cell, so the
# saved residual image would also contain the predicted-vs-actual points.
plt.figure()
plt.scatter(y_test, y_pred_l)
plt.xlabel("True Prices")
plt.ylabel("Predicted Prices")
plt.title('Predicted vs. Actual Prices Plot')
plt.savefig('Predicted vs. Actual Prices Plot-Linear.png')

# Residual Plot: prediction error (true - predicted) against the prediction.
residuals = y_test - y_pred_l
plt.figure()
plt.scatter(y_pred_l, residuals)
plt.xlabel('Predicted Prices')
plt.ylabel('Residuals')
plt.title('Residual Plot')
# Zero-error reference line.
plt.axhline(y=0, color='r', linestyle='--')
plt.savefig('Residual Plot-linear.png')

# Display both figures.
plt.show()

# Measure Score (score() on a regressor is the R^2 on the given data).
# The builtin round() is used instead of the .round() method so this works
# whether score() returns a NumPy scalar or a plain Python float.
print("Score for Linear Regression Model is:", round(lin_model.score(X_test, y_test), 2))
# Instantiate the RandomForestRegressor (a regressor, not a classifier: the
# target is a continuous price).
# random_state is fixed, matching the seeded train/test split, so that the
# fitted forest and its score are reproducible between runs.
forest_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the RandomForestRegressor on the training data.
forest_model.fit(X_train, y_train)

# Prediction on the test data.
y_pred_f = forest_model.predict(X_test)
# Predicted vs. Actual Plot
# Each plot gets its own figure. Without an explicit plt.figure(), both
# scatters land on the same implicit axes when run as one script/cell, so the
# saved residual image would also contain the predicted-vs-actual points.
plt.figure()
plt.scatter(y_test, y_pred_f)
plt.xlabel("True Prices")
plt.ylabel("Predicted Prices")
plt.title('Predicted vs. Actual Prices Plot')
plt.savefig('Predicted vs. Actual Prices Plot-Forest.png')

# Residual Plot: prediction error (true - predicted) against the prediction.
residuals = y_test - y_pred_f
plt.figure()
plt.scatter(y_pred_f, residuals)
plt.xlabel('Predicted Prices')
plt.ylabel('Residuals')
plt.title('Residual Plot')
# Zero-error reference line.
plt.axhline(y=0, color='r', linestyle='--')
plt.savefig('Residual Plot-Forest.png')

# Display both figures.
plt.show()

# Measure Score (score() on a regressor is the R^2 on the given data).
# The builtin round() is used instead of the .round() method so this works
# whether score() returns a NumPy scalar or a plain Python float.
print("Score for Forest Regression Model is:", round(forest_model.score(X_test, y_test), 2))