# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option("display.max_columns", 40)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
The dataset that we are going to use in this notebook, contains informations about customer purchase orders of an online shop. Our purpose is to clean the data, make few basic visualizations and create a dashboard with customer behavior.
So, first lets explain what data our dataset has.
Description of the columns:
df = pd.read_csv('/kaggle/input/online-shopping-dataset/file.csv')
df.head()
df.info()
There are a few columns with null values. We are going to drop all the empty values and some columns that we are not going to use in our analysis.
df.dropna(inplace=True)
df = df.drop(columns=['Unnamed: 0','Product_SKU', 'Product_Description'])
df.head()
Let's see if there are any duplicates or miswrightings in our data.
df2 = pd.DataFrame(df, columns=['Gender', 'Location', 'Product_Category', 'Coupon_Status', 'Coupon_Code', 'Tenure_Months'])
for col in df2:
print (col, df2[col].unique())
df.nunique()
pd.isnull(df).sum()
We can add two columns which will help us to better vizualize our results.
df['Total_Price']=df['Avg_Price']*df['Quantity']+df['Delivery_Charges']
df['Total_Spend']=df['Offline_Spend']+df['Online_Spend']
df.head()
The date column is object, so we are going to make datetime.
df["Date"] = pd.to_datetime(df["Date"])
df.info()
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df.head()
df.to_csv('shopping_data.csv', index=False)
import seaborn as sb
import matplotlib.pyplot as plt
val=df.Gender.value_counts()
plt.pie(val,labels=['Female','Male'],autopct="%1.1f%%",shadow=True,explode=(0.1,0),)
plt.axis('equal')
plt.title('Number of Male - Female Customer')
sb.set(style='white')
male=df[['Total_Price','Gender']].groupby('Gender').get_group('M')['Total_Price'].sum()
female=df[['Total_Price','Gender']].groupby('Gender').get_group('F')['Total_Price'].sum()
val=[male,female]
plt.pie(val,labels=['Female','Male'],autopct="%1.1f%%",shadow=True,explode=(0.1,0),)
plt.axis('equal')
plt.title('Price Spent by Male / Female Customer')
sb.set(style='white')
df.Coupon_Code.value_counts()
We have many coupons and with different discount (10%, 20% and 30%) but how many of them are actually used? We can see the big picture for all of them in the bar plot below. Only 34% of them was used.
fig=plt.figure()
axis=fig.add_axes([1,1,1,1])
nd1=df.Coupon_Status
sb.countplot(data=nd1,x=nd1,ax=axis)
for i in axis.patches:
axis.annotate(i.get_height(),(i.get_x(),i.get_height()),va='bottom',ha='left')
plt.ylabel('Frequency')
plt.title('Coupon_status')
We can see a more detailed view of how the coupons used below.
list = ['30','20','10']
for i in list:
filtered_df = df[df.Coupon_Code.str.contains(i)]
import seaborn as sns
p = sns.countplot(data=filtered_df, x='Coupon_Code', hue='Coupon_Status')
p.legend(title='Coupon_Status', bbox_to_anchor=(1, 1), loc='upper left')
#p.set(xlabel='How many customers used the ' +i '% Coupons')
plt.title('Coupon Status for ' + i +'% Discount')
# change rotation of x-labels
plt.xticks(rotation=45, ha='right')
#adjust size
plt.figure(figsize=(12, 10))
plt.show()
Let's see for different months how the average price from customers performs (max, min, mean and outliers.)
#boxplot chart
df.boxplot(by ='Month', column =['Avg_Price'], grid = False)
We can observe the same results also in the line graph below.
#line chart
df3=df.groupby(pd.PeriodIndex(df['Date'], freq="M"))['Total_Price'].mean()
df3.plot.line(x='Date', y='Avg_Price', title = 'Mean Value of Total Prices per Month')
It would also be interesting to see how many Female or Male chose the different kind of products.
df2 = df.groupby(['Product_Category', 'Gender'])['Product_Category'].count().unstack('Gender').fillna(0)
#stacked bar chart
ax=df2[['F','M']].plot(kind='bar', stacked=True, title='Number of Products purchased by Gender')
#add annotations
for container in ax.containers:
ax.bar_label(container, size=8)
Now, it would be interesting to see how the most purchased products sell over different months.
df_plot= df.groupby(['Month','Product_Category'])['Product_Category'].count().unstack('Product_Category').fillna(0)
df_plot.head()
areaplot = df_plot[['Apparel','Nest-USA','Office']].plot.area( figsize = (10,8), title='Number of top 3 Products purchased by Month', stacked= False)
We have different cities from USA (Chicago, California, New York, New Jersey, Washington DC) but where from are the most purchases?
sb.histplot(df.Location)
plt.ylabel('Frequency')
plt.xlabel('Cities')
plt.xticks(rotation=45)
plt.title('Location Frequencies')