GSP-340 : Build and Optimize Data Warehouses with BigQuery: Challenge Lab

GSP-340 : Build and Optimize Data Warehouses with BigQuery: Challenge Lab

Overview

Create a dataset

1bq mk <Dataset Name>

Task - 1 : Create a table partitioned by date

 1CREATE OR REPLACE TABLE <dataset_name>.<table_name>
 2PARTITION BY date
 3OPTIONS(
 4partition_expiration_days=360,
 5description="oxford_policy_tracker table in the COVID 19 Government Response public dataset with  an expiry time set to 90 days."
 6) AS
 7SELECT
 8   *
 9FROM
10   `bigquery-public-data.covid19_govt_response.oxford_policy_tracker`
11WHERE
12   alpha_3_code NOT IN ('GBR', 'BRA', 'CAN','USA')

Task - 2 : Add new columns to your table

 1ALTER TABLE <dataset_name>.<table_name>
 2ADD COLUMN population INT64,
 3ADD COLUMN country_area FLOAT64,
 4ADD COLUMN mobility STRUCT<
 5   avg_retail      FLOAT64,
 6   avg_grocery     FLOAT64,
 7   avg_parks       FLOAT64,
 8   avg_transit     FLOAT64,
 9   avg_workplace   FLOAT64,
10   avg_residential FLOAT64
11>

Task - 3 : Add country population data to the population column

 1CREATE OR REPLACE TABLE <dataset_name>.pop_data_2019 AS
 2SELECT
 3  country_territory_code,
 4  pop_data_2019
 5FROM 
 6  `bigquery-public-data.covid19_ecdc.covid_19_geographic_distribution_worldwide`
 7GROUP BY
 8  country_territory_code,
 9  pop_data_2019
10ORDER BY
11  country_territory_code
1UPDATE
2   `<dataset_name>.<table_name>` t0
3SET
4   population = t1.pop_data_2019
5FROM
6   `<dataset_name>.pop_data_2019` t1
7WHERE
8   CONCAT(t0.alpha_3_code) = CONCAT(t1.country_territory_code);

Task - 4 : Add country area data to the country_area column

1UPDATE
2   `<dataset_name>.<table_name>` t0
3SET
4   t0.country_area = t1.country_area
5FROM
6   `bigquery-public-data.census_bureau_international.country_names_area` t1
7WHERE
8   t0.country_name = t1.country_name

Task - 5 : Populate the mobility record data

 1UPDATE
 2   `<dataset_name>.<table_name>` t0
 3SET
 4   t0.mobility.avg_retail      = t1.avg_retail,
 5   t0.mobility.avg_grocery     = t1.avg_grocery,
 6   t0.mobility.avg_parks       = t1.avg_parks,
 7   t0.mobility.avg_transit     = t1.avg_transit,
 8   t0.mobility.avg_workplace   = t1.avg_workplace,
 9   t0.mobility.avg_residential = t1.avg_residential
10FROM
11   ( SELECT country_region, date,
12      AVG(retail_and_recreation_percent_change_from_baseline) as avg_retail,
13      AVG(grocery_and_pharmacy_percent_change_from_baseline)  as avg_grocery,
14      AVG(parks_percent_change_from_baseline) as avg_parks,
15      AVG(transit_stations_percent_change_from_baseline) as avg_transit,
16      AVG(workplaces_percent_change_from_baseline) as avg_workplace,
17      AVG(residential_percent_change_from_baseline)  as avg_residential
18      FROM `bigquery-public-data.covid19_google_mobility.mobility_report`
19      GROUP BY country_region, date
20   ) AS t1
21WHERE
22   CONCAT(t0.country_name, t0.date) = CONCAT(t1.country_region, t1.date)
23   

Task - 6 : Query missing data in population & country_area columns

1SELECT country_name, population
2FROM `<dataset_name>.<table_name>`
3WHERE population is NULL
1SELECT country_name, country_area
2FROM `<dataset_name>.<table_name>`
3WHERE country_area IS NULL
1SELECT DISTINCT country_name
2FROM `<dataset_name>.<table_name>`
3WHERE population is NULL
4UNION ALL
5SELECT DISTINCT country_name
6FROM `<dataset_name>.<table_name>`
7WHERE country_area IS NULL
8ORDER BY country_name ASC

Congratulations, you're all done with the lab 😄