Select distinct values in bigquery using standardsql

bigquery-standard-sqlgoogle-bigquery

i want to select multiple columns and group the email with GROUP BY

#standardSQL
SELECT
      customers.orderCustomerEmail AS email,      
      customers.orderCustomerNumber AS customerNumber,
      customers.billingFirstname AS billingFirstname,
      customers.billingLastname AS billingLastname
FROM dim_customers AS customers
GROUP BY customers.orderCustomerEmail

Fails with:

Error: SELECT list expression references customers.orderCustomerNumber
       which is neither grouped nor aggregated at [4:7]

This is similar to this question Bigquery select distinct values

but it doesn't solve my problem because its a different result adding all columns to GROUP BY same with SELECT DISTINCT

dim_customer schema:

orderCustomerEmail:STRING,
billingFirstname:STRING,
billingLastname:STRING,
orderCustomerNumber:STRING,
OrderNumber:STRING

dummy data: https://docs.google.com/spreadsheets/d/1T1JZRWni18hhU4tO-9kQqq5Y3hVWgpP-aE7o6ij9bDE/edit?usp=sharing

Best Answer

When you group by some columns, you need to make sure you apply some aggregation function(s) to the rest of the columns. otherwise you get exactly error that you show in your question

Try below example for BigQuery Standard SQL

#standardSQL
SELECT 
  customers.orderCustomerEmail AS email,      
  ARRAY_AGG(STRUCT(customers.orderCustomerNumber AS customerNumber,
  customers.billingFirstname AS billingFirstname,
  customers.billingLastname AS billingLastname)) AS info
FROM `dim_customers`, UNNEST(customers) AS customers
GROUP BY email

or with just simple DISTINCT

#standardSQL
SELECT DISTINCT 
  customers.orderCustomerEmail AS email,      
  customers.orderCustomerNumber AS customerNumber,
  customers.billingFirstname AS billingFirstname,
  customers.billingLastname AS billingLastname
FROM `dim_customers`, UNNEST(customers) AS customers

Please note: your question is not specific enough in terms of what exactly output you expect, so above will most likely require some adjustment to your specific needs

Update

i need basically one row per customer (email being the unique identifier, hence the group) the details (number, firstname, lastname) can be taken from the last entry e.g

#standardSQL
WITH `dim_customers` AS (
  SELECT [
    STRUCT('a' AS orderCustomerEmail, 1 AS orderCustomerNumber, 'af' AS billingFirstname, 'al' AS billingLastname),
    STRUCT('a' AS orderCustomerEmail, 4 AS orderCustomerNumber, 'af1' AS billingFirstname, 'al2' AS billingLastname),
    STRUCT('b' AS orderCustomerEmail, 2 AS orderCustomerNumber, 'bf' AS billingFirstname, 'bl' AS billingLastname),
    STRUCT('c' AS orderCustomerEmail, 3 AS orderCustomerNumber, 'cf' AS billingFirstname, 'cl' AS billingLastname)
    ] AS customers UNION ALL
  SELECT [
    STRUCT('a' AS orderCustomerEmail, 1 AS orderCustomerNumber, 'af' AS billingFirstname, 'al' AS billingLastname),
    STRUCT('a' AS orderCustomerEmail, 4 AS orderCustomerNumber, 'af1' AS billingFirstname, 'al2' AS billingLastname),
    STRUCT('b' AS orderCustomerEmail, 2 AS orderCustomerNumber, 'bf' AS billingFirstname, 'bl' AS billingLastname),
    STRUCT('c' AS orderCustomerEmail, 3 AS orderCustomerNumber, 'cf' AS billingFirstname, 'cl' AS billingLastname)
    ] AS customers
)
SELECT
  customers.orderCustomerEmail AS email,      
  ARRAY_AGG(STRUCT(customers.orderCustomerNumber AS customerNumber,
    customers.billingFirstname AS billingFirstname,
    customers.billingLastname AS billingLastname))[OFFSET(0)] AS info
FROM `dim_customers`, UNNEST(customers) AS customers
GROUP BY email

Update

below is for updated schema!

dim_customer schema:

orderCustomerEmail:STRING,
billingFirstname:STRING,
billingLastname:STRING,
orderCustomerNumber:STRING,
OrderNumber:STRING

#standardSQL
WITH `dim_customers` AS (
  SELECT 10201 AS orderCustomerNumber, 'a@email.com' AS orderCustomerEmail, 'Alex' AS billingFirstname, 'Miller' AS billingLastname UNION ALL
  SELECT 10202, 'b@email.com', 'Ben', 'Williams' UNION ALL
  SELECT 10203, 'c@email.com', 'Chris', 'Collins' UNION ALL
  SELECT 10204, 'd@email.com', 'David', 'Hems' UNION ALL
  SELECT 10201, 'a@email.com', 'A.', 'Miller' UNION ALL
  SELECT 10201, 'a@email.com', 'A.', 'Miller' UNION ALL
  SELECT 10202, 'b@email.com', 'Ben', 'Williams' UNION ALL
  SELECT 10202, 'b@email.com', 'Bens Father', 'Williams' UNION ALL
  SELECT 10205, 'a@email.com', 'A.', 'Miller' UNION ALL
  SELECT 10206, 'e@email.com', 'Ed', 'Winchell'
)
SELECT info.* FROM (
  SELECT
    orderCustomerEmail AS email, 
    ARRAY_AGG(STRUCT(
      orderCustomerEmail AS email, 
      orderCustomerNumber AS customerNumber,
      billingFirstname AS billingFirstname,
      billingLastname AS billingLastname))[OFFSET(0)] AS info
  FROM `dim_customers`
  GROUP BY email
)
-- ORDER BY email
Related Topic