BigQuery
Updated: May 22, 2026Categories: Query, Analytics
Printed from:
BigQuery SQL Cheatsheet: Petabyte-Scale Analytics
1. Connection and Basic Setup
gcloud CLI Authentication
Bash
12345678910111213# Install gcloud CLI
curl https://sdk.cloud.google.com | bash
# Initialize and authenticate
gcloud init
gcloud auth login
# Set application default credentials (for client libraries)
gcloud auth application-default login
# Set active project
gcloud config set project YOUR_PROJECT_ID
Web UI Access
- Navigate to https://console.cloud.google.com/bigquery
- Select project from dropdown
- Use BigQuery Studio for unified query, notebook, and pipeline authoring
- Use the
bqCLI for scripted operations (bq query --use_legacy_sql=false '...')
2. Project, Dataset, and Table Operations
Create Dataset
SQL
123456789-- GoogleSQL (the default and recommended dialect)
CREATE SCHEMA `project.dataset`
OPTIONS (
description = 'My analytics dataset',
location = 'US',
labels = [('environment', 'production')],
default_table_expiration_days = 30
);
Create Table
SQL
123456789101112131415161718192021-- Create table with schema
CREATE TABLE `project.dataset.table` (
id INT64,
name STRING,
created_at TIMESTAMP
);
-- Create table from query
CREATE TABLE `project.dataset.derived_table` AS
SELECT column1, column2
FROM `source_table`
WHERE condition;
-- Create or replace with primary/foreign key (informational, non-enforced)
CREATE OR REPLACE TABLE `project.dataset.orders` (
order_id INT64 NOT NULL,
customer_id INT64 NOT NULL,
PRIMARY KEY (order_id) NOT ENFORCED,
FOREIGN KEY (customer_id) REFERENCES `project.dataset.customers`(id) NOT ENFORCED
);
3. BigQuery-Specific Data Types
STRUCT
SQL
12345678910-- Creating a STRUCT
SELECT STRUCT(
'John' AS first_name,
'Doe' AS last_name
) AS person;
-- Accessing STRUCT fields
SELECT person.first_name
FROM table;
ARRAY
SQL
1234567-- Creating an ARRAY
SELECT ['apple', 'banana', 'cherry'] AS fruits;
-- Unnesting ARRAY
SELECT fruit
FROM table, UNNEST(fruits) AS fruit;
JSON
SQL
12345678910-- Native JSON type (preferred over STRING for semi-structured data)
SELECT JSON '{"id": 1, "name": "Alice"}' AS payload;
-- Extracting fields with the dot/subscript accessors
SELECT
payload.id AS id,
STRING(payload.name) AS name,
INT64(payload.id) AS id_int
FROM table;
GEOGRAPHY
SQL
1234567-- Creating a geographic point
SELECT ST_GEOGPOINT(-122.4194, 37.7749) AS san_francisco;
-- Calculating distance (meters)
SELECT ST_DISTANCE(point1, point2) AS distance_m
FROM geo_table;
RANGE
SQL
123-- RANGE<DATE>, RANGE<DATETIME>, RANGE<TIMESTAMP>
SELECT RANGE<DATE> '[2026-01-01, 2026-04-01)' AS q1_window;
4. GoogleSQL Operations
SELECT with Advanced Filtering
SQL
12345678910SELECT
column1,
column2
FROM `project.dataset.table`
WHERE
column1 IS NOT NULL
AND column2 BETWEEN 10 AND 100
ORDER BY column1 DESC
LIMIT 1000;
Pipe Syntax (alternative top-down query form)
SQL
1234567FROM `project.dataset.table`
|> WHERE column1 IS NOT NULL
|> AGGREGATE COUNT(*) AS n, AVG(column2) AS avg_val
GROUP BY column1
|> ORDER BY n DESC
|> LIMIT 100;
INSERT
SQL
12345INSERT INTO `project.dataset.table`
(id, name, created_at)
VALUES
(1, 'Test', CURRENT_TIMESTAMP());
MERGE (upsert)
SQL
12345678MERGE `project.dataset.target` T
USING `project.dataset.source` S
ON T.id = S.id
WHEN MATCHED THEN
UPDATE SET name = S.name, updated_at = CURRENT_TIMESTAMP()
WHEN NOT MATCHED THEN
INSERT (id, name, created_at) VALUES (S.id, S.name, CURRENT_TIMESTAMP());
UPDATE
SQL
123456UPDATE `project.dataset.table`
SET
status = 'active',
updated_at = CURRENT_TIMESTAMP()
WHERE id = 123;
DELETE
SQL
1234DELETE FROM `project.dataset.table`
WHERE status = 'inactive'
AND created_at < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 DAY);
TRUNCATE
SQL
12TRUNCATE TABLE `project.dataset.staging`;
5. Analytics Functions
Window Functions
SQL
12345678-- Ranking within partitions
SELECT
department,
employee_name,
salary,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS salary_rank
FROM employees;
Statistical and Approximate Functions
SQL
123456789-- Exact + approximate aggregation
SELECT
AVG(value) AS mean,
STDDEV(value) AS standard_deviation,
APPROX_QUANTILES(value, 4) AS quartiles,
APPROX_COUNT_DISTINCT(user_id) AS approx_users,
APPROX_TOP_COUNT(country, 5) AS top_countries
FROM metrics;
6. Array and Struct Operations
Array Aggregation
SQL
1234567-- Transform rows to array
SELECT
user_id,
ARRAY_AGG(DISTINCT product ORDER BY product) AS purchased_products
FROM purchases
GROUP BY user_id;
Struct Manipulation
SQL
12345678910-- Complex struct operations
SELECT
ARRAY(
SELECT AS STRUCT
name,
age,
ARRAY(SELECT score FROM UNNEST(scores)) AS test_scores
FROM users
) AS user_details;
7. Date and Time Functions
Date Manipulation
SQL
12345678-- Date arithmetic and parsing
SELECT
DATE_ADD(CURRENT_DATE(), INTERVAL 30 DAY) AS future_date,
EXTRACT(YEAR FROM ts) AS year,
FORMAT_DATE('%Y-%m', CURRENT_DATE()) AS month_year,
DATE_TRUNC(CURRENT_DATE(), MONTH) AS month_start
FROM table;
8. Geographic Functions
Spatial Queries
SQL
12345678910-- Find points within a region
SELECT
location,
ST_WITHIN(location, region) AS is_inside
FROM geo_data
WHERE ST_WITHIN(
location,
ST_GEOGFROMTEXT('POLYGON((-122.5 37.7, -122.3 37.7, -122.3 37.8, -122.5 37.8, -122.5 37.7))')
);
9. Machine Learning Integration (BigQuery ML)
Create and Train Model
SQL
1234567CREATE OR REPLACE MODEL `project.dataset.model`
OPTIONS(
model_type = 'linear_reg',
input_label_cols = ['target']
) AS
SELECT * FROM training_data;
Prediction
SQL
123456SELECT *
FROM ML.PREDICT(
MODEL `project.dataset.model`,
(SELECT * FROM prediction_data)
);
Generative AI via remote models
SQL
123456789101112-- Call a Vertex AI / Gemini model from SQL
CREATE OR REPLACE MODEL `project.dataset.gemini`
REMOTE WITH CONNECTION `project.us.my_connection`
OPTIONS (endpoint = 'gemini-2.0-flash');
SELECT *
FROM ML.GENERATE_TEXT(
MODEL `project.dataset.gemini`,
(SELECT review AS prompt FROM `project.dataset.reviews`),
STRUCT(0.2 AS temperature, TRUE AS flatten_json_output)
);
Vector search
SQL
1234567891011-- Generate embeddings and run approximate nearest-neighbor search
CREATE OR REPLACE VECTOR INDEX docs_idx
ON `project.dataset.docs`(embedding)
OPTIONS (index_type = 'IVF', distance_type = 'COSINE');
SELECT base.id, distance
FROM VECTOR_SEARCH(
TABLE `project.dataset.docs`, 'embedding',
(SELECT embedding FROM `project.dataset.query_embedding`),
top_k => 10);
10. Performance Optimization
Partitioning
SQL
123456-- Time-unit, ingestion-time, or integer-range partitioning
CREATE TABLE `project.dataset.partitioned_table`
PARTITION BY DATE(timestamp)
OPTIONS (require_partition_filter = TRUE)
AS SELECT * FROM source_table;
Clustering
SQL
123456-- Up to 4 clustering columns; order matters
CREATE TABLE `project.dataset.clustered_table`
PARTITION BY DATE(event_ts)
CLUSTER BY user_id, country
AS SELECT * FROM source_table;
Materialized Views and Search Indexes
SQL
1234567891011121314CREATE MATERIALIZED VIEW `project.dataset.mv_daily`
PARTITION BY day
AS
SELECT DATE(event_ts) AS day, country, COUNT(*) AS n
FROM `project.dataset.events`
GROUP BY day, country;
-- Accelerate text / needle-in-haystack lookups
CREATE SEARCH INDEX events_idx
ON `project.dataset.events`(ALL COLUMNS);
SELECT * FROM `project.dataset.events`
WHERE SEARCH(events, 'error_code:500');
11. Cost Optimization Strategies
- Prefer on-demand billing for spiky workloads; use Editions (Standard / Enterprise / Enterprise Plus) with autoscaling slot reservations for predictable workloads
- Always apply a partition filter (
require_partition_filter = TRUEenforces it) - Avoid
SELECT *; project only the columns you need - Use
--dry_run(bq query --dry_run) or the UI estimator to preview bytes scanned - Use BI Engine reservations to cache hot data for sub-second dashboards
- Prefer materialized views and search/vector indexes over re-aggregating raw tables
- Set custom quotas, budgets, and per-user/project query byte limits
12. External Data Sources
External Table
SQL
123456CREATE EXTERNAL TABLE `project.dataset.external_table`
OPTIONS (
format = 'CSV',
uris = ['gs://bucket/path/*.csv']
);
BigLake and Iceberg
SQL
1234567891011121314151617181920-- BigLake table over Cloud Storage with fine-grained access control
CREATE EXTERNAL TABLE `project.dataset.biglake_parquet`
WITH CONNECTION `project.us.my_connection`
OPTIONS (
format = 'PARQUET',
uris = ['gs://bucket/data/*.parquet']
);
-- BigQuery-managed Apache Iceberg table (read/write, open format on GCS)
CREATE TABLE `project.dataset.iceberg_orders` (
order_id INT64,
amount NUMERIC,
order_date DATE
)
WITH CONNECTION `project.us.my_connection`
OPTIONS (
table_format = 'ICEBERG',
storage_uri = 'gs://bucket/iceberg/orders'
);
Cross-cloud queries with Omni
SQL
12345-- Query data in Amazon S3 or Azure Blob via a BigQuery Omni connection
SELECT *
FROM `project.aws_us_east_1.s3_table`
WHERE event_date = CURRENT_DATE();
13. Best Practices
- Use GoogleSQL (legacy SQL is deprecated and disabled by default in new projects)
- Leverage
LIMITfor ad-hoc exploration only — it does not reduce bytes scanned - Use approximate aggregation functions for very large cardinalities
- Partition large tables and require partition filters
- Cluster by frequently filtered/grouped columns
- Avoid
SELECT *; rely on column pruning - Use STRUCT, ARRAY, JSON, and RANGE for semi-structured data instead of denormalizing
- Monitor performance with
INFORMATION_SCHEMA.JOBS_BY_*views and the query execution graph - Use change history (
APPENDS,CHANGES) and time travel (FOR SYSTEM_TIME AS OF) for incremental pipelines - Manage governance with Dataplex/Dataform, policy tags, row-level and column-level access
Additional Resources
Continue Learning
Discover more cheatsheets to boost your productivity