JSON 解析完全指南
Spark SQL 中处理 JSON 数据的所有姿势
SQL 专题版(原理讲解)
一、基础取值
get_json_object —— 取单个字段
-- 语法:get_json_object(json_string, path)
-- path 以 $ 开头,用 . 访问对象,用 [n] 访问数组
SELECT get_json_object('{"name":"张三","age":28}', '$.name');
-- 结果:张三
SELECT get_json_object('{"name":"张三","age":28}', '$.age');
-- 结果:28(注意:返回类型始终是 STRING)
注意事项:
- 返回值永远是 STRING,需要用 CAST 转类型
- 每次只能取一个字段,取多个字段需调用多次(性能差)
- path 不存在时返回 NULL
json_tuple —— 一次取多个字段
-- 一次解析多个字段,比多次 get_json_object 性能好
SELECT
j.name,
j.age,
j.city
FROM source_table
LATERAL VIEW json_tuple(json_col, 'name', 'age', 'city') j AS name, age, city;
vs get_json_object:
- json_tuple 只解析一次 JSON,性能更优
- 但 json_tuple 只能取第一层字段,不支持嵌套路径
- 需要配合 LATERAL VIEW 使用
from_json —— 转结构体(最强大)
-- 定义 schema,将 JSON 字符串转为结构体
SELECT
from_json(
'{"name":"张三","age":28,"scores":[90,85,92]}',
'struct<name:string, age:int, scores:array<int>>'
) AS parsed;
-- 转为结构体后可以直接用 . 取字段
SELECT
parsed.name,
parsed.age,
parsed.scores[0]
FROM (
SELECT from_json(json_col, 'struct<name:string, age:int, scores:array<int>>') AS parsed
FROM source_table
) t;
优势:类型安全,后续操作直接用结构体语法,无需反复解析。
二、数组处理
explode —— 展开 JSON 数组
-- 场景:一行包含多个元素的 JSON 数组,需要展开为多行
-- 原始数据:{"user":"张三","tags":["大数据","SQL","Python"]}
-- Step 1:取出数组字符串
-- Step 2:转为 array 类型
-- Step 3:explode 展开
SELECT
get_json_object(json_col, '$.user') AS user,
tag
FROM source_table
LATERAL VIEW explode(
from_json(get_json_object(json_col, '$.tags'), 'array<string>')
) t AS tag;
-- 结果:
-- 张三 大数据
-- 张三 SQL
-- 张三 Python
posexplode —— 带索引展开
-- 保留元素在数组中的位置
SELECT
user_id,
pos, -- 索引,从 0 开始
item
FROM source_table
LATERAL VIEW posexplode(
from_json(get_json_object(json_col, '$.items'), 'array<string>')
) t AS pos, item;
数组内取指定元素
-- 取数组第 N 个元素(0-based)
SELECT get_json_object(json_col, '$.scores[0]') AS first_score;
SELECT get_json_object(json_col, '$.scores[1]') AS second_score;
-- 取数组长度
SELECT size(from_json(get_json_object(json_col, '$.scores'), 'array<int>')) AS arr_len;
-- 取数组最后一个元素
SELECT
scores[size(scores) - 1] AS last_score
FROM (
SELECT from_json(get_json_object(json_col, '$.scores'), 'array<int>') AS scores
FROM source_table
) t;
三、嵌套解析
多层嵌套对象
-- 数据:{"user":{"name":"张三","address":{"city":"北京","district":"海淀"}}}
-- 方式一:get_json_object 直接用路径
SELECT
get_json_object(json_col, '$.user.name') AS name,
get_json_object(json_col, '$.user.address.city') AS city,
get_json_object(json_col, '$.user.address.district') AS district;
-- 方式二:from_json 定义完整 schema(推荐)
SELECT
parsed.user.name,
parsed.user.address.city,
parsed.user.address.district
FROM (
SELECT from_json(json_col,
'struct<user:struct<name:string, address:struct<city:string, district:string>>>'
) AS parsed
FROM source_table
) t;
数组嵌套对象
-- 数据:{"orders":[{"id":1,"amount":100},{"id":2,"amount":200}]}
SELECT
order_item.id,
order_item.amount
FROM source_table
LATERAL VIEW explode(
from_json(
get_json_object(json_col, '$.orders'),
'array<struct<id:int, amount:double>>'
)
) t AS order_item;
-- 结果:
-- 1 100.0
-- 2 200.0
对象嵌套数组
-- 数据:{"user":"张三","courses":[{"name":"数学","scores":[90,85,88]},{"name":"语文","scores":[78,82,80]}]}
SELECT
get_json_object(json_col, '$.user') AS user,
course.name AS course_name,
score
FROM source_table
LATERAL VIEW explode(
from_json(
get_json_object(json_col, '$.courses'),
'array<struct<name:string, scores:array<int>>>'
)
) t1 AS course
LATERAL VIEW explode(course.scores) t2 AS score;
-- 结果:
-- 张三 数学 90
-- 张三 数学 85
-- 张三 数学 88
-- 张三 语文 78
-- ...
四、Map 操作
str_to_map —— 字符串转 Map
-- 场景:KV 格式字符串 "k1=v1,k2=v2,k3=v3"
SELECT
kv_map['k1'] AS v1,
kv_map['k2'] AS v2
FROM (
SELECT str_to_map('k1=v1,k2=v2,k3=v3', ',', '=') AS kv_map
) t;
Map 常用操作
-- map_keys:获取所有 key
SELECT map_keys(str_to_map('a=1,b=2,c=3', ',', '='));
-- 结果:["a","b","c"]
-- map_values:获取所有 value
SELECT map_values(str_to_map('a=1,b=2,c=3', ',', '='));
-- 结果:["1","2","3"]
-- element_at:按 key 取值(比 [] 更安全,key 不存在返回 NULL)
SELECT element_at(map_col, 'target_key');
-- map_from_arrays:两个数组组合为 Map
SELECT map_from_arrays(array('a','b','c'), array(1,2,3));
-- explode Map 为多行
SELECT key, value
FROM source_table
LATERAL VIEW explode(str_to_map(kv_col, ',', '=')) t AS key, value;
五、结构体操作
-- named_struct:构造结构体
SELECT named_struct('name', '张三', 'age', 28, 'city', '北京') AS info;
-- 结构体取字段
SELECT info.name, info.age FROM (
SELECT named_struct('name', '张三', 'age', 28) AS info
) t;
-- 结构体数组
SELECT
collect_list(named_struct('id', id, 'name', name)) AS user_list
FROM users
GROUP BY dept;
六、Schema 推断
-- schema_of_json:自动推断 JSON 的 schema(开发调试用)
SELECT schema_of_json('{"name":"张三","age":28,"scores":[90,85]}');
-- 结果:struct<name:string,age:bigint,scores:array<bigint>>
-- 用推断结果直接传给 from_json
SELECT from_json(json_col, schema_of_json('{"name":"张三","age":28}'))
FROM source_table;
注意:schema_of_json 基于单条样本推断,生产环境建议手动定义 schema 以确保类型准确。
七、异常处理
JSON 格式不合法
-- get_json_object 对非法 JSON 返回 NULL
SELECT get_json_object('not a json', '$.name');
-- 结果:NULL
-- 过滤非法 JSON 行
SELECT *
FROM source_table
WHERE get_json_object(json_col, '$') IS NOT NULL;
-- 用 try_to_number 等 try_ 系列函数安全转换
SELECT
CAST(get_json_object(json_col, '$.age') AS INT) -- 非数字会报错
-- vs
TRY_CAST(get_json_object(json_col, '$.age') AS INT) -- 非数字返回 NULL(Spark 3.4+)
字段缺失
-- 字段不存在时返回 NULL,用 COALESCE 给默认值
SELECT
COALESCE(get_json_object(json_col, '$.nickname'), '未知') AS nickname;
类型不匹配
-- from_json schema 与实际数据不匹配时,整个结构体返回 NULL
-- 建议:先用 schema_of_json 确认实际类型,再定义 schema
-- 常见坑:JSON 中 age 有时是 "28"(字符串),有时是 28(数字)
-- 解决:schema 中用 string 接收,后续 CAST
SELECT CAST(parsed.age AS INT)
FROM (
SELECT from_json(json_col, 'struct<age:string>') AS parsed
FROM source_table
) t;
八、JSON 写出
-- to_json:结构体/Map 转 JSON 字符串
SELECT to_json(named_struct('name', '张三', 'age', 28));
-- 结果:{"name":"张三","age":28}
-- 数组转 JSON
SELECT to_json(array(1, 2, 3));
-- 结果:[1,2,3]
-- Map 转 JSON
SELECT to_json(map('name', '张三', 'city', '北京'));
-- 结果:{"name":"张三","city":"北京"}
-- 聚合后输出 JSON 数组
SELECT
dept,
to_json(collect_list(named_struct('name', name, 'salary', salary))) AS employees_json
FROM employees
GROUP BY dept;
九、实战场景
场景一:埋点日志解析
-- 原始日志:{"event":"click","ts":1700000000,"props":{"page":"home","button_id":"buy_btn","ext":{"ab_test":"v2"}}}
SELECT
get_json_object(log, '$.event') AS event_name,
from_unixtime(CAST(get_json_object(log, '$.ts') AS BIGINT)) AS event_time,
get_json_object(log, '$.props.page') AS page,
get_json_object(log, '$.props.button_id') AS button_id,
get_json_object(log, '$.props.ext.ab_test') AS ab_test
FROM ods_event_log
WHERE dt = '${bizdate}';
场景二:接口返回值解析
-- API 响应:{"code":200,"data":{"list":[{"id":1,"name":"商品A"},{"id":2,"name":"商品B"}],"total":100}}
SELECT
get_json_object(response, '$.code') AS code,
get_json_object(response, '$.data.total') AS total,
item.id,
item.name
FROM api_response_table
LATERAL VIEW explode(
from_json(
get_json_object(response, '$.data.list'),
'array<struct<id:int, name:string>>'
)
) t AS item
WHERE get_json_object(response, '$.code') = '200';
场景三:动态 KV 属性解析
-- 用户属性存储为动态 JSON:{"vip_level":"3","register_channel":"wechat","preferences":"sports,tech"}
-- 方式一:已知要取哪些 key
SELECT
user_id,
get_json_object(attrs, '$.vip_level') AS vip_level,
get_json_object(attrs, '$.register_channel') AS channel
FROM user_profile;
-- 方式二:将动态 KV 全部展开为行
SELECT
user_id,
attr_key,
attr_value
FROM user_profile
LATERAL VIEW explode(
from_json(attrs, 'map<string,string>')
) t AS attr_key, attr_value;
实用工具版(速查模板)
快速对照表
| 需求 | 用什么 | 示例 |
|---|---|---|
| 取单个字段 | get_json_object(col, '$.key') | get_json_object(j, '$.name') |
| 取多个一级字段 | json_tuple + LATERAL VIEW | 见下方 |
| 取嵌套字段 | get_json_object(col, '$.a.b.c') | get_json_object(j, '$.user.addr.city') |
| JSON→结构体 | from_json(col, schema) | 见下方 |
| 展开 JSON 数组 | explode(from_json(..., 'array<...>')) | 见下方 |
| 带索引展开 | posexplode(...) | posexplode(arr) AS pos, item |
| 取数组第N个 | get_json_object(col, '$.arr[0]') | 0-based 索引 |
| KV 字符串→Map | str_to_map(col, ',', '=') | str_to_map('a=1,b=2',',','=') |
| Map 取值 | element_at(map, key) 或 map['key'] | |
| 结构体→JSON | to_json(struct) | |
| 推断 Schema | schema_of_json(sample) | 开发调试用 |
常用模板
-- ① 取单个嵌套字段
get_json_object(json_col, '$.level1.level2.field')
-- ② 一次取多个一级字段
SELECT j.*
FROM table LATERAL VIEW json_tuple(json_col, 'f1', 'f2', 'f3') j AS f1, f2, f3;
-- ③ 展开 JSON 对象数组
SELECT item.field1, item.field2
FROM table
LATERAL VIEW explode(from_json(get_json_object(json_col, '$.array_field'), 'array<struct<field1:type1, field2:type2>>')) t AS item;
-- ④ 两层展开(数组套数组)
SELECT outer_item.name, inner_val
FROM table
LATERAL VIEW explode(from_json(get_json_object(json_col, '$.outer'), 'array<struct<name:string, inner:array<int>>>')) t1 AS outer_item
LATERAL VIEW explode(outer_item.inner) t2 AS inner_val;
-- ⑤ 动态 KV 展开为行
SELECT key, value
FROM table
LATERAL VIEW explode(from_json(json_col, 'map<string,string>')) t AS key, value;
-- ⑥ JSON 格式校验(过滤脏数据)
WHERE get_json_object(json_col, '$') IS NOT NULL
-- ⑦ 安全取值 + 默认值
COALESCE(get_json_object(json_col, '$.field'), '默认值')
-- ⑧ 取数组长度
size(from_json(get_json_object(json_col, '$.arr'), 'array<string>'))