# Installing the libraries with the specified version.
# !pip install numpy==1.25.2 pandas==1.5.3 matplotlib==3.7.1 seaborn==0.13.1 -q --user


# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns


# uncomment and run the following lines for Google Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Write your code here to read the data
# Reading FoodHub Master Data
data = pd.read_csv('/content/drive/MyDrive/content/foodhub_order.csv')


# Write your code here to view the first 5 rows
# Reading 5 top rows
data.head()


# Write your code here
print('Number of rows in the data:', data.shape[0])
print('Number of columns in the data:', data.shape[1])

Number of rows in the data: 1898
Number of columns in the data: 9


# Write your code here
data.info();

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB


# Write your code here
data.isnull().sum()


data[['customer_id','order_id']]=data[['customer_id','order_id']].astype(object)


# Write your code here
data.describe()


# Write the code here
data.groupby('rating')['rating'].count()


chr = sns.countplot(data = data, x = 'cuisine_type', stat='percent',
                    legend=True, order=data['cuisine_type'].value_counts().index);
plt.xlabel('Type of food');
plt.ylabel('Total orders(%)');
plt.xticks(rotation=90);
plt.title('Prefered food by Customers');
chr.bar_label(chr.containers[0], fmt='%.0f%%');
plt.show()


# Write the code here
sns.histplot(data = data, x = 'cost_of_the_order', kde=True);
plt.xlabel('Cost of the order');
plt.ylabel('Count');
plt.title('Distribution of Cost per order');
plt.show()


sns.boxplot(data = data, x = 'cost_of_the_order');
plt.xlabel('Cost of the order');
plt.ylabel('Count');
plt.title('Distribution of Cost per order');
plt.show()


chr = sns.histplot(data = data, x = 'day_of_the_week', stat='percent', legend=True);
plt.xlabel('Day of the week');
plt.ylabel('Orders by day');
plt.title('Distribution of Order per day');
chr.bar_label(chr.containers[0], fmt='%.1f%%');
plt.show()


chr = sns.histplot(data = data, x = 'rating', stat='percent', legend=True);
plt.xlabel('Rating');
plt.ylabel('Value distribution');
plt.title('Rating by orders');
chr.bar_label(chr.containers[0], fmt='%.1f%%');
plt.show()


#Replace Not given values' with nan
data['rating'] = data['rating'].replace(['Not given'],np.nan)
#Convert the data type to a float
data['rating'] = data['rating'].astype(float)


chr = sns.histplot(data = data, x = 'food_preparation_time', legend=True, kde=True);
plt.xlabel('Preparation time');
plt.ylabel('Total Orders');
plt.show()


chr = sns.histplot(data = data, x = 'delivery_time', legend=True, kde=True);
plt.xlabel('Delivery time');
plt.ylabel('Total Orders');
plt.show()


# Write the code here
data['restaurant_name'].value_counts().nlargest(5)

restaurant_name
Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
Name: count, dtype: int64


# Write the code here
data.groupby('day_of_the_week')['cuisine_type'].value_counts().nlargest(1)

day_of_the_week  cuisine_type
Weekend          American        415
Name: count, dtype: int64


# Write the code here
df_v = [data[data['cost_of_the_order']>20].shape[0]/data.shape[0]*100,
      data[data['cost_of_the_order']<21].shape[0]/data.shape[0]*100]
df_l = ['More than 20 dollars', 'Less than 20 dollars']
# Creating chart
plt.figure(figsize=(6,6))
sns.set_style("whitegrid")
sns.set_palette("pastel")
plt.pie(df_v, labels=df_l, autopct='%.0f%%')
# Add a title
plt.title("Percentage of orders that cost more than $20")

# Display the plot
plt.show()


# Write the code here
round(data['delivery_time'].mean(), 1)

24.2


# Write the code here
data['customer_id'].value_counts().nlargest(3)

customer_id
52832    13
47440    10
83287     9
Name: count, dtype: int64


#Convert the data type to a float
data['order_id'] = data['order_id'].astype(str)
data['customer_id'] = data['customer_id'].astype(str)


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   object 
 1   customer_id            1898 non-null   object 
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1162 non-null   float64
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
 9   total_delivery_time    1898 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 148.4+ KB


# Write the code here
plt.figure(figsize=(7,3))
sns.heatmap(data=data[['cost_of_the_order', 'rating', 'food_preparation_time', 'delivery_time']].corr(), annot=True, cmap='Spectral', vmin=-1, vmax=1)
plt.show()


sns.boxplot(data = data, x = 'rating', y = 'cost_of_the_order');
plt.xlabel('Rating');
plt.ylabel('Cost of the order');
plt.show()


sns.boxplot(data = data, x = 'cuisine_type', y = 'cost_of_the_order', hue='cuisine_type');
plt.xlabel('Cuisine Type');
plt.ylabel('Cost of the order');
plt.xticks(rotation=90);
plt.show()


sns.boxplot(data = data, x = 'day_of_the_week', y = 'food_preparation_time', hue='day_of_the_week');
plt.xlabel('Day of the week');
plt.ylabel('Preparation time');
plt.show()


# Write the code here
df = data.groupby('restaurant_name')['rating'].agg(['mean','count'])


df = df[(df['count']>50) & (df['mean']>4)]
df


fd_20 = data[data['cost_of_the_order']>20]
fd_5 = data[(data['cost_of_the_order']>5) & (data['cost_of_the_order']<=20)]


fd_20_rv = fd_20['cost_of_the_order'].sum()*0.25
fd_5_rv = fd_5['cost_of_the_order'].sum()*0.15


print(fd_20_rv + fd_5_rv)

6166.303


data['total_delivery_time'] = data['food_preparation_time'] + data['delivery_time']


round(data[data['total_delivery_time'] > 60].shape[0]/data.shape[0]*100, 2)

10.54


# Write the code here
data.groupby('day_of_the_week')['delivery_time'].mean().round(2)

day_of_the_week
Weekday    28.34
Weekend    22.47
Name: delivery_time, dtype: float64


plt.figure(figsize=(6,6))
sns.boxplot(data = data, x = 'day_of_the_week', y = 'delivery_time');
plt.xlabel('Day of the week');
plt.ylabel('Delivery time');
plt.title('Delivery time by day of the week');
plt.show()

	order_id	customer_id	cost_of_the_order	food_preparation_time	delivery_time
count	1.898000e+03	1898.000000	1898.000000	1898.000000	1898.000000
mean	1.477496e+06	171168.478398	16.498851	27.371970	24.161749
std	5.480497e+02	113698.139743	7.483812	4.632481	4.972637
min	1.476547e+06	1311.000000	4.470000	20.000000	15.000000
25%	1.477021e+06	77787.750000	12.080000	23.000000	20.000000
50%	1.477496e+06	128600.000000	14.140000	27.000000	25.000000
75%	1.477970e+06	270525.000000	22.297500	31.000000	28.000000
max	1.478444e+06	405334.000000	35.410000	35.000000	33.000000

	rating
rating
3	188
4	386
5	588
Not given	736

	mean	count
restaurant_name
'wichcraft	5.000000	1
12 Chairs	4.500000	2
5 Napkin Burger	4.000000	2
67 Burger	5.000000	1
Alidoro	NaN	0
...	...	...
Zero Otto Nove	4.000000	1
brgr	3.000000	1
da Umberto	5.000000	1
ilili Restaurant	4.153846	13
indikitch	4.500000	2

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4	25	24

	mean	count
restaurant_name
Blue Ribbon Fried Chicken	4.328125	64
Blue Ribbon Sushi	4.219178	73
Shake Shack	4.278195	133
The Meatball Shop	4.511905	84

FoodHub Data Analysis¶

Context¶

Objective¶

Data Description¶

Data Dictionary¶

Let us start by importing the required libraries¶

Understanding the structure of the data¶

Question 1: How many rows and columns are present in the data?¶

Observations:¶

Question 2: What are the datatypes of the different columns in the dataset?¶

Observations:¶

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method.¶

Observations:¶

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]¶

Observations:¶

Question 5: How many orders are not rated?¶

Observations:¶

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [9 marks]¶

Question 7: Which are the top 5 restaurants in terms of the number of orders received?¶

Observations:¶

Question 8: Which is the most popular cuisine on weekends?¶

Observations:¶

Question 9: What percentage of the orders cost more than 20 dollars? [2 marks]¶

Observations:¶

Question 10: What is the mean order delivery time?¶

Observations:¶

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed.¶

Observations:¶

Multivariate Analysis¶

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [10 marks]¶

Observations:¶

Observations:¶

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [3 marks]¶

Observations:¶

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [2 marks]¶

Observations:¶

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [2 marks]¶

Observations:¶

Conclusion and Recommendations¶

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [6 marks]¶

Conclusions:¶

Recommendations:¶