# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
The dataset has information about two different hotel types, City Hotel and Resort Hotel. In this notebook we are going to clean our data and make a basis analysis.
The dataset that we are going to use contains the following columns:
adr : average daily rate
adults : number of adults
agent : ID of the travel agency that made the booking
arrival_date_day_of_month : Day of the month of the arrival date
arrival_date_month : Month of arrival date with 12 categories: “January” to “December”
arrival_date_week_number : Week number of the arrival date
arrival_date_year : Year of arrival date
assigned_room_type : Code for the type of room assigned to the booking. Sometimes the assigned room type differs from the reserved room type due to hotel operation reasons (e.g. overbooking) or by customer request. Code is presented instead of designation for anonymity reasons.
babies : Number of babies
booking_changes : Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation.
children : Number of children.
company : ID of the company/entity that made the booking or responsible for paying the booking. ID is presented instead of designation for anonymity reasons.
country : Country of origin. Categories are represented in the ISO 3155–3:2013 format
customer_type : Type of booking, assuming one of four categories :
days_in_waiting_list : Number of days the booking was in the waiting list before it was confirmed to the customer
deposit_type : Indication on if the customer made a deposit to guarantee the booking. This variable can assume three categories:
distribution_channel : Booking distribution channel. The term “TA” means “Travel Agents” and “TO” means “Tour Operators”.
is_canceled : Value indicating if the booking was canceled (1) or not (0).
is_repeated_quests : Value indicating if the booking name was from a repeated guest (1) or not (0).
lead_time : Number of days that elapsed between the entering date of the booking into the PMS and the arrival date.
market_segment : Market segment designation. In categories, the term “TA” means “Travel Agents” and “TO” means “Tour Operators”.
meal : Type of meal booked. Categories are presented in standard hospitality meal packages:
previous_bookings_not_canceled : Number of previous bookings not cancelled by the customer prior to the current booking.
previous_cancellations : Number of previous bookings that were cancelled by the customer prior to the current booking.
required_car_parking_spaces : Number of car parking spaces required by the customer.
reservation_status : Reservation last status, assuming one of three categories:
reservation_status_date : Date at which the last status was set. This variable can be used in conjunction with the ReservationStatus to understand when was the booking canceled or when did the customer checked-out of the hotel.
reserved_room_type : Code of room type reserved. Code is presented instead of designation for anonymity reasons.
stays_in_weekend_nights : Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel.
stays_in_week_nights : Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel.
total_of_special_requests : Number of special requests made by the customer (e.g. twin bed or high floor).
hotel_bookings = pd.read_csv("/kaggle/input/hotel-booking-demand/hotel_bookings.csv")
hotel_bookings.head()
hotel_bookings.info()
We are going to drop company feature, because most of the values in it are null and the arrival_date_week_number because we will not use it.
hotel_bookings.drop('company',inplace=True,axis=1)
hotel_bookings.drop('arrival_date_week_number',inplace=True,axis=1)
There are few columns that have missing values (children, country, agent, company).
hotel_bookings.isnull().sum()
We are going to fill the null values:
# replace null values with median for children column
hotel_bookings['children'] = hotel_bookings['children'].fillna(hotel_bookings['children'].median())
# replace null values with median for agent column
hotel_bookings['agent'] = hotel_bookings['agent'].fillna(hotel_bookings['agent'].median())
# find which country is more frequent in our data
print(hotel_bookings['country'].mode())
# replace null values with the most frequent value in the country column
hotel_bookings['country'].fillna(hotel_bookings['country'].mode()[0], inplace=True)
Let's check our results.
hotel_bookings.isnull().sum()
Now that we are done with the null values we can go further and ask questions in our data.
hotel_bookings['arrival_date_year'] = hotel_bookings['arrival_date_year'].apply(str)
There might be miswritings in our data. Therefore we will check the object columns to see if there are errors. Like, in the column arrival_date_month might be typed "Aug" instead of "August", or in column arrival_date_year might be typed "'15" instead of "2015" etc. But we can see that there are not miswritings here.
#find the unique values on our data
df = pd.DataFrame(hotel_bookings, columns=["hotel", "arrival_date_month","arrival_date_year", "country", "market_segment", "distribution_channel", "reserved_room_type", "assigned_room_type", "deposit_type", "customer_type", "reservation_status"])
for col in df:
print (col, df[col].unique())
We have three columns (children, adults, babies) that can't be zero all the them at the same row. But we can see that there are 180 rows where all of them are 0.
# filter data
filter = (hotel_bookings.children == 0) & (hotel_bookings.adults == 0) & (hotel_bookings.babies == 0)
hotel_bookings[filter]
#drop rows with logical errors
indexguest = hotel_bookings[(hotel_bookings['children'] == 0) & (hotel_bookings['adults'] == 0) & (hotel_bookings['babies'] == 0)].index
hotel_bookings.drop(indexguest , inplace=True)
#reset index to our new dataset
hotel_bookings.reset_index(drop=True, inplace=True)
hotel_bookings.head()
Let's check if the rows with the wrong data are dropped. Now our dataset from 119390 rows has 119210 (exactly 180 rows less)
hotel_bookings.info()
hotel_bookings.to_csv('/kaggle/working/hotel_bookings_clean.csv',index=False)
Let's make a dataset which will include only the people that did not cancel their room reservation. And then we will analyze the data for those that did cancelled.
not_cancelled = hotel_bookings[hotel_bookings['is_canceled'] == 0]
not_cancelled.head()
Let's ckeck if we have the right dataset. If the there are any values for columns stays_in_weekend_nights and stays_in_week_night have same value 0 or 1, then something is wrong and we have false data. Maybe something wrong with the data entry. Our outcome is False that means we are good to go.
not_cancelled['stays_in_weekend_nights'].equals(not_cancelled['stays_in_week_nights'])
The people that did not cancel their reservation is 63% of our data
not_cancelled.info()
We can observe from the following table that:
not_cancelled.describe()
not_cancelled.describe(include=['O'])
Let's observe the above results in a more visual way
import seaborn as sns
import matplotlib.pyplot as plt
val=not_cancelled.hotel.value_counts()
plt.pie(val,labels=['City Hotel','Resort Hotel'],autopct="%1.1f%%",shadow=True,explode=(0.1,0),textprops={'fontsize': 12})
plt.axis('equal')
plt.title('Hotel Type Preference', fontsize = 15)
# We can simply use a countplot as we sre visualising categorical data
plt.figure(figsize=(20,5))
# data we will use in a list
l1 = ['hotel','arrival_date_month']
sns.set(font_scale=1.5)
# plotting
sns.countplot(data = not_cancelled[l1],x= "arrival_date_month",hue="hotel",order=["January","February","March","April","May","June",
"July","August","September","October","November","December"]).set_title(
'Number of Visitors Each Month')
plt.xlabel('Month')
plt.ylabel('Count')
# We can simply use a countplot as we are visualising categorical data
plt.figure(figsize=(20,10))
# data we will use in a list
l2 = ['hotel','arrival_date_year']
sns.set(font_scale=2)
# plotting
sns.countplot(data = not_cancelled[l2],x= "arrival_date_year",hue="hotel",order=["2015","2016","2017"]).set_title(
'Number of Visitors Each Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.figure(figsize = (7, 5))
sns.histplot(not_cancelled["lead_time"], bins = 20)
Let's check from which country we the guests that checked-in. We will limit our data to the countries that had at least 200 guests, in order to plot the countries with the most guests.
country_counts = not_cancelled['country'].value_counts().reset_index()
country_counts.columns = ['country', 'No of guests']
country_counts
indexguest2 = country_counts[(country_counts['No of guests'] < 200)].index
country_counts.drop(indexguest2 , inplace=True)
country_counts
country_counts.plot(kind='bar',x='country',y='No of guests',figsize=(15,8))
cancelled = hotel_bookings[hotel_bookings['is_canceled'] == 1]
cancelled.head()
Checking if our dataset is right. By checking the column "reservation_status" we can see that it has only values "Canceled" and "No-show". Therefore, our dataset is right, having only the cancelled data.
cancelled['reservation_status'].unique()
cancelled['stays_in_weekend_nights'].equals(cancelled['stays_in_week_nights'])
cancelled.info()
We can observe from the following table that:
cancelled.describe()
cancelled.describe(include=['O'])
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(10,10)) #ax1,ax2 refer to your two pies
# 1,2 denotes 1 row, 2 columns - if you want to stack vertically, it would be 2,1
ax1.pie(not_cancelled['hotel'].value_counts(),
labels=['City Hotel', 'Resort Hotel'],autopct = '%1.1f%%') #plot first pie
ax1.set_title('Guests vs Hotel Type')
ax2.pie(cancelled['hotel'].value_counts(),
labels=['City Hotel', 'Resort Hotel'],autopct = '%1.1f%%') #plot second pie
ax2.set_title('Cancellation vs Hotel Type')
# make two subplots side by side which share the same y-axis
fig, ax =plt.subplots(1,2,figsize=(10,7), sharey=True)
l2 = ['hotel','arrival_date_year']
sns.countplot(not_cancelled[l2],x= "arrival_date_year",hue="hotel",order=["2015","2016","2017"], ax=ax[0]).set_title(
'Number of Visitors Each Year', fontsize=18)
sns.countplot(cancelled[l2],x= "arrival_date_year",hue="hotel",order=["2015","2016","2017"], ax=ax[1]).set_title(
'Number of Cancellations Each Year', fontsize=18)
#add space
fig.tight_layout()
leg = ax[0].legend(fontsize=12)
leg = ax[1].legend(fontsize=12)
fig.show()
# make two subplots side by side which share the same y-axis
fig, ax =plt.subplots(1,2, figsize=(12,7), sharey=True)
l1 = ['hotel','arrival_date_month']
sns.countplot(not_cancelled[l1],x= "arrival_date_month",hue="hotel",order=["January","February","March","April","May","June",
"July","August","September","October","November","December"], ax=ax[0]).set_title(
'Number of Visitors Each Year')
sns.countplot(cancelled[l1],x= "arrival_date_month",hue="hotel",order=["January","February","March","April","May","June",
"July","August","September","October","November","December"], ax=ax[1]).set_title(
'Number of Cancellations Each Year')
# change rotation of x-labels
plt.xticks(rotation=45, ha='right')
leg = ax[0].legend(fontsize=12)
leg = ax[1].legend(fontsize=12)
#add space
fig.tight_layout()
fig.show()