Skip to content

Commit 2326ad6

Browse files
authored
fix: handle unsupported types and empty results in describe (#2506)
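The .describe() method attempts to compute metrics like mean, max, and unique on all columns. For multimodal (MM) complex types (structs representing images/videos), running COUNT(DISTINCT column) or mathematical aggregates is not supported by BigQuery and raises syntax/type validation errors. We now limit aggregations on OBJ_REF_DTYPE and JSON columns to a basic .count(), skipping the unhashable/unsupported summary metrics. The stacking step is also guarded so that describe() returns an empty result, instead of failing, when no summary columns are produced (for example, when every column has an unsupported type or the DataFrame is empty). Fixes #<452681068> 🦕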
1 parent 91b6c24 commit 2326ad6

File tree

2 files changed

+45
-3
lines changed

2 files changed

+45
-3
lines changed

bigframes/pandas/core/methods/describe.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,10 @@ def describe(
5656
"max",
5757
]
5858
).intersection(describe_block.column_labels.get_level_values(-1))
59-
describe_block = describe_block.stack(override_labels=stack_cols)
60-
61-
return dataframe.DataFrame(describe_block).droplevel(level=0)
59+
if not stack_cols.empty:
60+
describe_block = describe_block.stack(override_labels=stack_cols)
61+
return dataframe.DataFrame(describe_block).droplevel(level=0)
62+
return dataframe.DataFrame(describe_block)
6263

6364

6465
def _describe(
@@ -120,5 +121,7 @@ def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
120121
dtypes.TIME_DTYPE,
121122
]:
122123
return [aggregations.count_op, aggregations.nunique_op]
124+
elif dtypes.is_json_like(dtype) or dtype == dtypes.OBJ_REF_DTYPE:
125+
return [aggregations.count_op]
123126
else:
124127
return []

tests/system/small/pandas/test_describe.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import pandas.testing
1616
import pytest
1717

18+
import bigframes.pandas as bpd
19+
1820

1921
def test_df_describe_non_temporal(scalars_dfs):
2022
# TODO: supply a reason why this isn't compatible with pandas 1.x
@@ -352,3 +354,40 @@ def test_series_groupby_describe(scalars_dfs):
352354
check_dtype=False,
353355
check_index_type=False,
354356
)
357+
358+
359+
def test_describe_json_and_obj_ref_returns_count(session):
360+
# Test describe() works on JSON and OBJ_REF types (without nunique, which fails)
361+
sql = """
362+
SELECT
363+
PARSE_JSON('{"a": 1}') AS json_col,
364+
'gs://cloud-samples-data/vision/ocr/sign.jpg' AS uri_col
365+
"""
366+
df = session.read_gbq(sql)
367+
368+
df["obj_ref_col"] = df["uri_col"].str.to_blob()
369+
df = df.drop(columns=["uri_col"])
370+
371+
res = df.describe(include="all").to_pandas()
372+
373+
assert "count" in res.index
374+
assert res.loc["count", "json_col"] == 1.0
375+
assert res.loc["count", "obj_ref_col"] == 1.0
376+
377+
378+
def test_describe_with_unsupported_type_returns_empty_dataframe(session):
379+
df = session.read_gbq("SELECT ST_GEOGPOINT(1.0, 2.0) AS geo_col")
380+
381+
res = df.describe().to_pandas()
382+
383+
assert len(res.columns) == 0
384+
assert len(res.index) == 1
385+
386+
387+
def test_describe_empty_dataframe_returns_empty_dataframe(session):
388+
df = bpd.DataFrame()
389+
390+
res = df.describe().to_pandas()
391+
392+
assert len(res.columns) == 0
393+
assert len(res.index) == 1

0 commit comments

Comments
 (0)