-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathExploratory_Data_Analysis_(EDA).sql
More file actions
195 lines (155 loc) · 5.97 KB
/
Exploratory_Data_Analysis_(EDA).sql
File metadata and controls
195 lines (155 loc) · 5.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
--========================================
-- Data Exploration
--========================================
-- Explore all objects in the database
SELECT * FROM INFORMATION_SCHEMA.TABLES
-- Explore all columns in the database
SELECT * FROM INFORMATION_SCHEMA.columns
WHERE table_name = 'dim_products'
--========================================
-- Dimension Exploration
--========================================
-- Explore the categories of the products' the sells
SELECT DISTINCT Category, subcategory, product_name FROM gold.dim_products
ORDER BY category
-- Explore customers' countries
SELECT DISTINCT country FROM gold.dim_customers
--========================================
-- Date Exploration
--========================================
-- How many years of sales are available
SELECT
MIN(order_date) AS earliest_order_date,
MAX(order_date) AS last_order_date,
EXTRACT(years FROM AGE(MAX(order_date), MIN(order_date))) AS orders_range_years,
EXTRACT(YEAR FROM AGE(CURRENT_DATE, MAX(order_date))) AS years_since_last_order,
MIN(shipping_date) AS earliest_shipping_date,
MAX(shipping_date) AS last_shipping_date,
MIN(due_date) AS earliest_due_date,
MAX(due_date) AS last_due_date
FROM gold.fact_sales;
-- Find the youngest and oldest customers
SELECT
MIN(birth_date) oldest_customer,
EXTRACT(year FROM AGE (CURRENT_DATE, MIN(birth_date))) oldest_customer_age,
MAX(birth_date) youngest_customer,
EXTRACT(year FROM AGE (CURRENT_DATE, MAX(birth_date))) youngest_customer_age
FROM gold.dim_customers
--========================================
-- Measure Exploration
--========================================
-- Find the Total Sales
SELECT SUM(sales_amount) FROM gold.fact_sales
-- Find how many items are sold
SELECT COUNT(quantity) FROM gold.fact_sales
-- Find the average selling price
SELECT ROUND(AVG(price)) FROM gold.fact_sales
-- Find the Total number of Orders
SELECT COUNT(order_number) FROM gold.fact_sales
SELECT COUNT(DISTINCT order_number) FROM gold.fact_sales
-- Find the total number of products
SELECT COUNT(DISTINCT product_name) FROM gold.dim_products
-- Find the total number of customers
SELECT COUNT(customer_key) FROM gold.dim_customers
-- Find the total number of customers that has placed an order (maybe there are some who registered but didn't place an order)
SeLeCT COUNT(DISTINCT customer_key) FROM gold.fact_sales
-- Generate a report that contains all key metrics of the business
SELECT 'Total Sales' AS measure_name ,SUM(sales_amount) AS measure_value FROM gold.fact_sales
UNION all
SELECT 'Total Quantity', COUNT(quantity) FROM gold.fact_sales
UNION all
SELECT 'Average Price', ROUND(AVG(price)) FROM gold.fact_sales
UNION all
SELECT 'Total Orders', COUNT(DISTINCT order_number) FROM gold.fact_sales
UNION all
SELECT 'Nr.of Products', COUNT(DISTINCT product_name) FROM gold.dim_products
UNION all
SELECT 'Nr.of Customers',COUNT(customer_key) FROM gold.dim_customers
UNION all
SeLeCT 'Nr.of Customers who Ordered', COUNT(DISTINCT customer_key) FROM gold.fact_sales
--========================================
-- Magnitude Exploration
--========================================
-- Find total customers by countries
SELECT country,COUNT(customer_key) total_customers FROM gold.dim_customers GROUP BY country ORDER BY COUNT(customer_key) DESC
-- Find total customers by gender
SELECT gender, COUNT(customer_key) total_customers FROM gold.dim_customers GROUP BY gender ORDER BY COUNT(customer_key) DESC
-- Find total products by category
SELECT category,COUNT(product_name) FROM gold.dim_products GROUP BY category ORDER BY COUNT(product_name) DESC
-- What is the average costs in each category?
SELECT category,AVG(product_cost) FROM gold.dim_products GROUP BY category ORDER BY AVG(product_cost) DESC
-- What is the total revenue generated for each category?
SELECT
dp.category,
SUM(sales_amount)
FROM gold.fact_sales fc
LEFT JOIN gold.dim_products dp
ON fc.product_key = dp.product_key
GROUP BY category ORDER BY SUM(sales_amount) DESC
-- Find total revenue that is generated by each customer
SELECT customer_key, SUM(sales_amount) FROM gold.fact_sales GROUP BY customer_key ORDER BY SUM(sales_amount) DESC
-- What is the distribution of sold items across countries?
SELECT
dc.country,
SUM(quantity)
FROM gold.fact_sales fc
LEFT JOIN gold.dim_customers dc
ON fc.customer_key = dc.customer_key
GROUP BY country ORDER BY SUM(quantity) DESC
--========================================
-- Ranking Analysis
--========================================
-- which products generated the highest revenue
SELECT
dp.product_name,
SUM(sales_amount)
FROM gold.fact_sales fc
LEFT JOIN gold.dim_products dp
ON fc.product_key = dp.product_key
GROUP BY product_name ORDER BY SUM(sales_amount) DESC
LIMIT 5;
-- can be solved also like this
/* SELECT * FROM (
SELECT
dp.product_name,
SUM(sales_amount) total_revenue,
ROW_NUMBER() OVER (ORDER BY SUM(sales_amount) DESC) Ranking
FROM gold.fact_sales fc
LEFT JOIN gold.dim_products dp
ON fc.product_key = dp.product_key
GROUP BY product_name
)
WHERE Ranking <= 5 */
-- What are the worst performing products in term of sales
SELECT
dp.product_name,
SUM(sales_amount) total_revenue
FROM gold.fact_sales fc
LEFT JOIN gold.dim_products dp
ON fc.product_key = dp.product_key
GROUP BY product_name ORDER BY SUM(sales_amount) ASC
LIMIT 5;
-- Find the top 10 customers who have generated the highest revenue
SELECT
dc.customer_key,
first_name,
last_name,
SUM(sales_amount)
FROM gold.fact_sales fc
LEFT JOIN gold.dim_customers dc
ON fc.customer_key = dc.customer_key
GROUP BY dc.customer_key, first_name ,last_name
ORDER BY SUM(sales_amount) DESC
LIMIT 10;
-- and the 3 customers with the fewest orders placed
SELECT
dc.customer_key,
first_name,
last_name,
COUNT(order_number)
FROM gold.fact_sales fc
LEFT JOIN gold.dim_customers dc
ON fc.customer_key = dc.customer_key
GROUP BY dc.customer_key, first_name ,last_name
ORDER BY COUNT(order_number) ASC
LIMIT 3;