# Load in some packages
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# load dataset
clothing_df =pd.read_excel(r"C:\Users\jki\Downloads\clothing sales dataset.xlsx")  
clothing_df.head(6)


clothing_df.describe()


# lets  check for missing values
missing_values  = clothing_df.isna().sum()
print(missing_values)

Receipt no          0
Location            0
Customer name       0
Session id          0
Sale date           0
Status              0
Ordertype name      0
Token number        0
Product name        0
Product Category    0
Cost Per Unit       0
Selling Per Unit    0
Quantity            0
Cost Amount         0
Sales Amount        0
Profit              0
dtype: int64


# lets check the data types
clothing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4972 entries, 0 to 4971
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Receipt no        4972 non-null   object        
 1   Location          4972 non-null   object        
 2   Customer name     4972 non-null   object        
 3   Session id        4972 non-null   int64         
 4   Sale date         4972 non-null   datetime64[ns]
 5   Status            4972 non-null   object        
 6   Ordertype name    4972 non-null   object        
 7   Token number      4972 non-null   int64         
 8   Product name      4972 non-null   object        
 9   Product Category  4972 non-null   object        
 10  Cost Per Unit     4972 non-null   int64         
 11  Selling Per Unit  4972 non-null   int64         
 12  Quantity          4972 non-null   int64         
 13  Cost Amount       4972 non-null   int64         
 14  Sales Amount      4972 non-null   int64         
 15  Profit            4972 non-null   int64         
dtypes: datetime64[ns](1), int64(8), object(7)
memory usage: 621.6+ KB


# Let's create a new variable, Month, from 'Order Date':

clothing_df['Month'] = clothing_df['Sale date'].dt.month
clothing_df['Month'].describe()

count    4972.000000
mean        5.385358
std         3.208746
min         1.000000
25%         3.000000
50%         4.000000
75%         9.000000
max        12.000000
Name: Month, dtype: float64


clothing_df.head(5)


clothing_df['Year'] =clothing_df['Sale date'].dt.year
clothing_df['Year'].describe()

count    4972.000000
mean     2019.286203
std         0.452031
min      2019.000000
25%      2019.000000
50%      2019.000000
75%      2020.000000
max      2020.000000
Name: Year, dtype: float64


# Load in some packages
import calendar
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter

warnings.filterwarnings("ignore")

# 1. What was the best month for sales? How much was earned that month?

# Replace NaN or inf values in the 'Month' column with a default value (e.g., 0)
clothing_df['Month'] = clothing_df['Month'].fillna(0).astype(int)

# Convert month numbers to abbreviated month names
clothing_df['Month Name'] = clothing_df['Month'].apply(lambda x: calendar.month_abbr[x])

# Group by month and calculate total sales for each month
sales_by_month = clothing_df.groupby('Month Name').sum()['Sales Amount']

# Find the best month for sales and the corresponding earnings
best_month = sales_by_month.idxmax()
earnings_for_best_month = sales_by_month.max()

print(f"The best month for sales was {best_month} with earnings of ${earnings_for_best_month:,.2f}")

The best month for sales was Mar with earnings of $1,142,600.00


# Plot the highest number of sales for each Location

Sales_by_location = clothing_df.groupby('Location').sum()['Sales Amount'] # Take the number of sales for each Location
unique_location =  clothing_df.groupby('Location').sum().index # Take all Location

# Plot the highest number of sales for each Location
plt.bar(unique_location, Sales_by_location)
plt.xticks(unique_location, rotation = 90, size = 8)
plt.ylabel('Sales Amount)')
plt.xlabel('Location')
plt.show()


# Plot the highest number of sales for each Customer

Sales_by_Customer_name = clothing_df.groupby('Customer name').sum()['Sales Amount'] # Take the number of sales for each Location
unique_Customer_name =  clothing_df.groupby('Customer name').sum().index # Take all Location

# Plot the highest number of sales for each Customer
plt.bar(unique_Customer_name,Sales_by_Customer_name )
plt.xticks(unique_Customer_name, rotation = 90, size = 8)
plt.ylabel('Sales Amount)')
plt.xlabel('Customer name')
plt.show()


# Plot the highest number of sales for each Order Type Name

Sales_by_Ordertype_name  = clothing_df.groupby('Ordertype name').sum()['Sales Amount'] # Take the number of sales for each Location
unique_Ordertype_name  =  clothing_df.groupby('Ordertype name').sum().index # Take all Location

# Plot the highest number of sales for each Order Type Name
plt.bar(unique_Ordertype_name,Sales_by_Ordertype_name)
plt.xticks(unique_Ordertype_name, rotation = 90, size = 8)
plt.ylabel('Sales Amount)')
plt.xlabel('Ordertype name')
plt.show()


# Plot the highest number of sales for each Product_name

Sales_by_Product_name = clothing_df.groupby('Product name').sum()['Sales Amount'] # Take the number of sales for each Product_name
unique_Product_name =  clothing_df.groupby('Product name').sum().index # Take all Product_name

# Plot the highest number of sales for each Product_name
plt.bar(unique_Product_name,Sales_by_Product_name)
plt.xticks(unique_Product_name, rotation = 90, size = 8)
plt.ylabel('Sales Amount)')
plt.xlabel('Product_name')
plt.show()


# Plot the highest number of sales for each Product Category

Sales_by_Product_Category = clothing_df.groupby('Product Category').sum()['Sales Amount'] # Take the number of sales for each Product Category
unique_Product_Category =  clothing_df.groupby('Product Category').sum().index # Take all Product Category

# Plot the highest number of sales for each Product Category
plt.bar(unique_Product_Category,Sales_by_Product_Category )
plt.xticks(unique_Product_Category, rotation = 90, size = 8)
plt.ylabel('Sales Amount)')
plt.xlabel('Product Category')
plt.show()


# Plot the highest number of sales for each Location

Sales_by_location = clothing_df.groupby('Location').sum()['Quantity'] # Take the number of sales for each Location
unique_location =  clothing_df.groupby('Location').sum().index # Take all Location

# Plot the highest number of sales for each Location
plt.bar(unique_location, Sales_by_location)
plt.xticks(unique_location, rotation = 90, size = 8)
plt.ylabel('Quantity Amount)')
plt.xlabel('Location')
plt.show()


# Plot the highest number of sales for each Customer

Sales_by_Customer_name = clothing_df.groupby('Customer name').sum()['Quantity'] # Take the number of sales for each Location
unique_Customer_name =  clothing_df.groupby('Customer name').sum().index # Take all Location

# Plot the highest number of sales for each Customer
plt.bar(unique_Customer_name,Sales_by_Customer_name )
plt.xticks(unique_Customer_name, rotation = 90, size = 8)
plt.ylabel('Quantity Amount)')
plt.xlabel('Customer name')
plt.show()


# Plot the highest number of sales for each Order Type

Sales_by_Ordertype_name  = clothing_df.groupby('Ordertype name').sum()['Quantity'] # Take the number of sales for each Location
unique_Ordertype_name  =  clothing_df.groupby('Ordertype name').sum().index # Take all Location

# Plot the highest number of sales for each Order Type
plt.bar(unique_Ordertype_name ,Sales_by_Ordertype_name)
plt.xticks(unique_Ordertype_name, rotation = 90, size = 8)
plt.ylabel('Quantity Amount)')
plt.xlabel('Ordertype_name')
plt.show()


# Plot the highest number of sales for each Product_name

Sales_by_Product_name = clothing_df.groupby('Product name').sum()['Quantity'] # Take the number of sales for each Product_name
unique_Product_name =  clothing_df.groupby('Product name').sum().index # Take all Product_name

# Plot the highest number of sales for each Product_name
plt.bar(unique_Product_name,Sales_by_Product_name)
plt.xticks(unique_Product_name, rotation = 90, size = 8)
plt.ylabel('Quantity Amount)')
plt.xlabel('Product_name')
plt.show()


# Plot the highest number of sales for each Product Category

Sales_by_Product_Category = clothing_df.groupby('Product Category').sum()['Quantity'] # Take the number of sales for each Product Category
unique_Product_Category =  clothing_df.groupby('Product Category').sum().index # Take all Product Category

# Plot the highest number of sales for each Product Category
plt.bar(unique_Product_Category,Sales_by_Product_Category )
plt.xticks(unique_Product_Category, rotation = 90, size = 8)
plt.ylabel('Quantity Amount)')
plt.xlabel('Product Category')
plt.show()

	Receipt no	Location	Customer name	Session id	Sale date	Status	Ordertype name	Product name	Product Category	Cost Per Unit	Selling Per Unit	Quantity	Cost Amount	Sales Amount	Profit
0	BL11	Chennai	Naga	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200
1	BL11	Chennai	Mahesh	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200
2	BL12	Hyderabad	Ganesh	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200
3	BL12	Chennai	Kumar	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200
4	BL12	Bangalore	Vijay	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200
5	BL12	Bangalore	Sunil	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200

	Session id	Token number	Cost Per Unit	Selling Per Unit	Quantity	Cost Amount	Sales Amount	Profit
count	4972.0	4972.0	4972.000000	4972.000000	4972.000000	4972.000000	4972.000000	4972.000000
mean	28.0	0.0	668.825422	868.825422	1.038013	697.687047	905.289622	207.602574
std	0.0	0.0	206.379808	206.379808	0.248842	293.852867	330.276940	49.768368
min	28.0	0.0	100.000000	300.000000	1.000000	100.000000	300.000000	200.000000
25%	28.0	0.0	400.000000	600.000000	1.000000	400.000000	600.000000	200.000000
50%	28.0	0.0	700.000000	900.000000	1.000000	700.000000	900.000000	200.000000
75%	28.0	0.0	900.000000	1100.000000	1.000000	900.000000	1100.000000	200.000000
max	28.0	0.0	1000.000000	1200.000000	3.000000	2700.000000	3300.000000	600.000000

	Receipt no	Location	Customer name	Session id	Sale date	Status	Ordertype name	Product name	Product Category	Cost Per Unit	Selling Per Unit	Quantity	Cost Amount	Sales Amount	Profit	Month
0	BL11	Chennai	Naga	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200	3
1	BL11	Chennai	Mahesh	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200	3
2	BL12	Hyderabad	Ganesh	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200	3
3	BL12	Chennai	Kumar	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200	3
4	BL12	Bangalore	Vijay	28	2020-03-06	Ordered	On Shop	Jeans Levis	Casual Wear	900	1100	1	900	1100	200	3

Clothing Sales Exploratory Data Analysis¶

1. What was the best month for sales? How much was earned that month?¶

2. What Location had the Highest number of Sales?¶

2. Which Customer bought the most?¶

3. Which Ordertype name had the most sales¶

4. Which Product had the most sales¶

5. Which Product Category had the highest sales¶

5. What Location sold the most units¶

6 Which Customer bought the most units?¶

7 Which Ordertype name sold the most units¶

8 Which Product sold the most units¶

9 Which Product Category had the most sales¶