## Importing libraries for the work
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot
import seaborn as sns
shipment_data = pd.read_csv(r"C:\Users\User Pc\Desktop\Datasets\Shipment data Python\Shipment details data.csv",
encoding='cp1252')
shipment_data.head(2)
| Type | Days for shipping (real) | Days for shipment (scheduled) | Benefit per order | Sales per customer | Delivery Status | Late_delivery_risk | Category Id | Category Name | Customer City | ... | Order Zipcode | Product Card Id | Product Category Id | Product Description | Product Image | Product Name | Product Price | Product Status | shipping date (DateOrders) | Shipping Mode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEBIT | 3 | 4 | 91.250000 | 314.640015 | Advance shipping | 0 | 73 | Sporting Goods | Caguas | ... | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 2/3/2018 22:56 | Standard Class |
| 1 | TRANSFER | 5 | 4 | -249.089996 | 311.359985 | Late delivery | 1 | 73 | Sporting Goods | Caguas | ... | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/18/2018 12:27 | Standard Class |
2 rows × 53 columns
## Identifying the number of columns and columns names
print(f"The no of columns in the dataset is {len(shipment_data.columns)}"),shipment_data.columns
The no of columns in the dataset is 53
(None,
Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
'Benefit per order', 'Sales per customer', 'Delivery Status',
'Late_delivery_risk', 'Category Id', 'Category Name', 'Customer City',
'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Id',
'Customer Lname', 'Customer Password', 'Customer Segment',
'Customer State', 'Customer Street', 'Customer Zipcode',
'Department Id', 'Department Name', 'Latitude', 'Longitude', 'Market',
'Order City', 'Order Country', 'Order Customer Id',
'order date (DateOrders)', 'Order Id', 'Order Item Cardprod Id',
'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id',
'Order Item Product Price', 'Order Item Profit Ratio',
'Order Item Quantity', 'Sales', 'Order Item Total',
'Order Profit Per Order', 'Order Region', 'Order State', 'Order Status',
'Order Zipcode', 'Product Card Id', 'Product Category Id',
'Product Description', 'Product Image', 'Product Name', 'Product Price',
'Product Status', 'shipping date (DateOrders)', 'Shipping Mode'],
dtype='object'))
shipment_data.drop(columns=['Late_delivery_risk', 'Category Id', 'Customer Email', 'Customer Fname', 'Customer City',
'Customer Lname', 'Customer Password', 'Customer State', 'Customer Street', 'Customer Zipcode',
'Department Id', 'Latitude', 'Longitude', 'Order Item Cardprod Id', 'Order Item Profit Ratio',
'Sales', 'Order Item Total', 'Order Profit Per Order',
'Order Status', 'Product Price', 'Product Status'], inplace=True)
## Dropping more unnecessary columns in the dataset
shipment_data.drop(columns=['Customer Segment', 'Order Item Product Price', 'Order State',
'Order Region', 'Order Zipcode', 'Product Card Id', 'Product Category Id',
'Product Description', 'Product Image'], inplace=True)
shipment_data.head(2)
| Type | Days for shipping (real) | Days for shipment (scheduled) | Benefit per order | Sales per customer | Delivery Status | Category Name | Customer Country | Customer Id | Department Name | ... | Order Customer Id | order date (DateOrders) | Order Id | Order Item Discount | Order Item Discount Rate | Order Item Id | Order Item Quantity | Product Name | shipping date (DateOrders) | Shipping Mode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEBIT | 3 | 4 | 91.250000 | 314.640015 | Advance shipping | Sporting Goods | Puerto Rico | 20755 | Fitness | ... | 20755 | 1/31/2018 22:56 | 77202 | 13.110000 | 0.04 | 180517 | 1 | Smart watch | 2/3/2018 22:56 | Standard Class |
| 1 | TRANSFER | 5 | 4 | -249.089996 | 311.359985 | Late delivery | Sporting Goods | Puerto Rico | 19492 | Fitness | ... | 19492 | 1/13/2018 12:27 | 75939 | 16.389999 | 0.05 | 179254 | 1 | Smart watch | 1/18/2018 12:27 | Standard Class |
2 rows × 23 columns
shipment_data.columns
Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
'Benefit per order', 'Sales per customer', 'Delivery Status',
'Category Name', 'Customer Country', 'Customer Id', 'Department Name',
'Market', 'Order City', 'Order Country', 'Order Customer Id',
'order date (DateOrders)', 'Order Id', 'Order Item Discount',
'Order Item Discount Rate', 'Order Item Id', 'Order Item Quantity',
'Product Name', 'shipping date (DateOrders)', 'Shipping Mode'],
dtype='object')
shipment_data.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Days for shipping (real) | 180519.0 | 3.497654 | 1.623722 | 0.00000 | 2.000000 | 3.000000 | 5.000000 | 6.000000 |
| Days for shipment (scheduled) | 180519.0 | 2.931847 | 1.374449 | 0.00000 | 2.000000 | 4.000000 | 4.000000 | 4.000000 |
| Benefit per order | 180519.0 | 21.974989 | 104.433526 | -4274.97998 | 7.000000 | 31.520000 | 64.800003 | 911.799988 |
| Sales per customer | 180519.0 | 183.107609 | 120.043670 | 7.49000 | 104.379997 | 163.990005 | 247.399994 | 1939.989990 |
| Customer Id | 180519.0 | 6691.379495 | 4162.918106 | 1.00000 | 3258.500000 | 6457.000000 | 9779.000000 | 20757.000000 |
| Order Customer Id | 180519.0 | 6691.379495 | 4162.918106 | 1.00000 | 3258.500000 | 6457.000000 | 9779.000000 | 20757.000000 |
| Order Id | 180519.0 | 36221.894903 | 21045.379569 | 1.00000 | 18057.000000 | 36140.000000 | 54144.000000 | 77204.000000 |
| Order Item Discount | 180519.0 | 20.664741 | 21.800901 | 0.00000 | 5.400000 | 14.000000 | 29.990000 | 500.000000 |
| Order Item Discount Rate | 180519.0 | 0.101668 | 0.070415 | 0.00000 | 0.040000 | 0.100000 | 0.160000 | 0.250000 |
| Order Item Id | 180519.0 | 90260.000000 | 52111.490959 | 1.00000 | 45130.500000 | 90260.000000 | 135389.500000 | 180519.000000 |
| Order Item Quantity | 180519.0 | 2.127638 | 1.453451 | 1.00000 | 1.000000 | 1.000000 | 3.000000 | 5.000000 |
## Vital info about Real shipment time(day) vs scheduled shipment time(day)
## By comparing their respective STD, Mean and Max.
real_shipment_days = shipment_data['Days for shipping (real)'].value_counts()
real_shipment_days.plot(kind='pie', title='No of times each shipment duration(days) reoccurred', figsize=(6,6))
<Axes: title={'center': 'No of times each shipment duration(days) reoccurred'}, ylabel='count'>
shipment_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 180519 entries, 0 to 180518 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Type 180519 non-null object 1 Days for shipping (real) 180519 non-null int64 2 Days for shipment (scheduled) 180519 non-null int64 3 Benefit per order 180519 non-null float64 4 Sales per customer 180519 non-null float64 5 Delivery Status 180519 non-null object 6 Category Name 180519 non-null object 7 Customer Country 180519 non-null object 8 Customer Id 180519 non-null int64 9 Department Name 180519 non-null object 10 Market 180519 non-null object 11 Order City 180519 non-null object 12 Order Country 180519 non-null object 13 Order Customer Id 180519 non-null int64 14 order date (DateOrders) 180519 non-null object 15 Order Id 180519 non-null int64 16 Order Item Discount 180519 non-null float64 17 Order Item Discount Rate 180519 non-null float64 18 Order Item Id 180519 non-null int64 19 Order Item Quantity 180519 non-null int64 20 Product Name 180519 non-null object 21 shipping date (DateOrders) 180519 non-null object 22 Shipping Mode 180519 non-null object dtypes: float64(4), int64(7), object(12) memory usage: 31.7+ MB
shipment_data[ 'order date (DateOrders)'] = pd.to_datetime(shipment_data[ 'order date (DateOrders)'])
## Coverting the Orders date data to Year
shipment_data['Year'] = shipment_data[ 'order date (DateOrders)'].dt.year
customers_orders_per_year = shipment_data.groupby('Year')[['Customer Id', 'Order Id']].nunique()
customers_orders_per_year
| Customer Id | Order Id | |
|---|---|---|
| Year | ||
| 2015 | 10131 | 20904 |
| 2016 | 10222 | 20859 |
| 2017 | 15156 | 21866 |
| 2018 | 2123 | 2123 |
customers_orders_per_year.plot(kind='barh', rot=0, grid=True, stacked=True, color=['gold', 'red'])
## The viz shows the company had a major drop off in customers in the year 2018
<Axes: ylabel='Year'>
sales_qty_profit_per_year = shipment_data.groupby('Year')[['Benefit per order',
'Sales per customer', 'Order Item Quantity']].sum()
sales_qty_profit_per_year
| Benefit per order | Sales per customer | Order Item Quantity | |
|---|---|---|---|
| Year | |||
| 2015 | 1.318857e+06 | 1.108954e+07 | 138480 |
| 2016 | 1.310119e+06 | 1.105600e+07 | 137352 |
| 2017 | 1.304085e+06 | 1.061091e+07 | 106124 |
| 2018 | 3.384189e+04 | 2.979521e+05 | 2123 |
sales_qty_profit_per_year.plot.bar(rot=0, grid=True, color=['darkred', 'darkblue', 'darkgreen'])
## The viz shows;
## 1.A relatively similarity in sales, order Qty, and profit per order from 2015 to 2017
## (though sales in 2017 was a slight drop in comaprison to 2015 and 2016)
## 3. the company had a major drop off in sales,order Qty and profit per order in the year 2018
<Axes: xlabel='Year'>
products_by_orders = shipment_data.groupby('Product Name',
as_index=False)[['Order Id']].nunique().sort_values(by='Order Id', ascending=False)
products_by_orders.head(3)
| Product Name | Order Id | |
|---|---|---|
| 71 | Perfect Fitness Perfect Rip Deck | 20359 |
| 56 | Nike Men's CJ Elite 2 TD Football Cleat | 18783 |
| 59 | Nike Men's Dri-FIT Victory Golf Polo | 17869 |
products_by_orders[:10].plot.barh(x='Product Name', y='Order Id', color= 'indigo')
<Axes: ylabel='Product Name'>
sales_qty_profit_by_product = shipment_data.groupby('Product Name', as_index=False)[['Benefit per order',
'Sales per customer']].sum().sort_values(by='Sales per customer', ascending=False)
sales_qty_profit_by_product.head(5)
| Product Name | Benefit per order | Sales per customer | |
|---|---|---|---|
| 24 | Field & Stream Sportsman 16 Gun Fire Safe | 756220.767190 | 6.226935e+06 |
| 71 | Perfect Fitness Perfect Rip Deck | 493828.299782 | 3.973180e+06 |
| 21 | Diamondback Women's Serene Classic Comfort Bi | 427455.568106 | 3.700784e+06 |
| 61 | Nike Men's Free 5.0+ Running Shoe | 379915.818503 | 3.295693e+06 |
| 59 | Nike Men's Dri-FIT Victory Golf Polo | 350421.029567 | 2.828708e+06 |
sales_qty_profit_by_product[:5].plot(kind='barh', x='Product Name', rot=0)
<Axes: ylabel='Product Name'>
customers_orders_by_shipping_mode = shipment_data.groupby('Shipping Mode', as_index=False)[['Customer Id',
'Order Id']].nunique().sort_values(by=['Customer Id','Order Id'], ascending=False)
customers_orders_by_shipping_mode
| Shipping Mode | Customer Id | Order Id | |
|---|---|---|---|
| 3 | Standard Class | 16578 | 39324 |
| 2 | Second Class | 8969 | 12778 |
| 0 | First Class | 7594 | 10079 |
| 1 | Same Day | 3208 | 3571 |
customers_orders_by_shipping_mode.plot(x='Shipping Mode', kind='bar', rot=20, color=['brown', 'magenta'], stacked=True)
## The viz shows the shipping mode with the highest orders by customer demands is the Standard Class shipping mode
<Axes: xlabel='Shipping Mode'>
sales_profit_by_shipping_mode = shipment_data.groupby('Shipping Mode', as_index=False)[['Benefit per order',
'Sales per customer']].sum().sort_values(by=['Benefit per order',
'Sales per customer'], ascending=False)
sales_profit_by_shipping_mode
| Shipping Mode | Benefit per order | Sales per customer | |
|---|---|---|---|
| 3 | Standard Class | 2.370454e+06 | 1.979222e+07 |
| 2 | Second Class | 7.503082e+05 | 6.416529e+06 |
| 0 | First Class | 6.431219e+05 | 5.100451e+06 |
| 1 | Same Day | 2.030184e+05 | 1.745202e+06 |
sales_profit_by_shipping_mode.plot(x='Shipping Mode', kind='barh', rot=0, color=['black', 'orange'], stacked=True)
<Axes: ylabel='Shipping Mode'>
customers_orders_by_shipping_mode = shipment_data.groupby('Shipping Mode', as_index=False)[['Customer Id',
'Order Id']].nunique().sort_values(by=['Customer Id','Order Id'], ascending=False)
customers_orders_by_shipping_mode
country_by_customers_orders = shipment_data.groupby('Customer Country', as_index=False)[['Customer Id',
'Order Id']].nunique().sort_values(by=['Customer Id','Order Id'], ascending=False)
country_by_customers_orders
| Customer Country | Customer Id | Order Id | |
|---|---|---|---|
| 0 | EE. UU. | 12719 | 40440 |
| 1 | Puerto Rico | 7933 | 25312 |
country_by_customers_orders.plot.bar(x='Customer Country',rot=0, grid=True,
color=['darkred', 'green'])
## The viz shows that EE. UU. countries are where most of the companies clients live(Over 60%)
## and the same with the orders, where orders are over 60% as well from EE. UU. countries
<Axes: xlabel='Customer Country'>
destination_country_of_orders = shipment_data.groupby('Order Country',
as_index=False)['Order Id'].nunique().sort_values(by='Order Id', ascending=False)
destination_country_of_orders.head(10)
| Order Country | Order Id | |
|---|---|---|
| 48 | Estados Unidos | 8270 |
| 53 | Francia | 4866 |
| 102 | México | 4395 |
| 8 | Australia | 3798 |
| 2 | Alemania | 3518 |
| 120 | Reino Unido | 2785 |
| 20 | Brasil | 2650 |
| 31 | China | 2616 |
| 69 | India | 2152 |
| 70 | Indonesia | 1920 |
destination_country_of_orders[:10].plot.barh(x='Order Country',rot=0, color='grey')
<Axes: ylabel='Order Country'>
## comparing the highest shipment delivery mode used the most by the orders
delivery_status_by_orders = shipment_data.groupby('Delivery Status',
as_index=False)['Order Id'].nunique().sort_values(by='Order Id', ascending=False)
delivery_status_by_orders
| Delivery Status | Order Id | |
|---|---|---|
| 1 | Late delivery | 36048 |
| 0 | Advance shipping | 15127 |
| 3 | Shipping on time | 11722 |
| 2 | Shipping canceled | 2855 |
delivery_status_by_orders.plot.bar(x='Delivery Status',rot=20, color='darkgreen')
## The viz below shows that more than 50% of the orders delivered were Late deliveries
<Axes: xlabel='Delivery Status'>