>>> import pandas as pd
>>> from xorq.expr.udf import make_pandas_udf
>>> import xorq.expr.datatypes as dt
>>> import xorq as xo
make_pandas_udf
xorq.expr.udf.make_pandas_udf(
    fn,
    schema,
    return_type,
    database=None,
    catalog=None,
    name=None,
    **kwargs,
)
Create a scalar User-Defined Function (UDF) that operates on pandas DataFrames.
This function creates a scalar UDF: it yields one output value per input row, while your function itself runs on whole batches, with PyArrow arrays converted to pandas DataFrames for processing. It’s ideal for operations that benefit from pandas’ rich functionality and are easier to express as DataFrame operations.
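In practice this means your function receives a pandas DataFrame whose columns are the schema keys and should return something pandas can align row-wise. A minimal sketch of that contract (the function and its columns are illustrative, not part of the xorq API):
>>> def row_total(df):
...     # df is a pandas DataFrame with one column per schema key;
...     # return a pandas Series with one value per input row
...     return df.sum(axis=1)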
Parameters
Name | Type | Description | Default |
---|---|---|---|
fn | callable | The function to be executed. Should accept a pandas DataFrame and return a pandas Series or scalar value. | required |
schema | Schema | The input schema defining column names and their data types. | required |
return_type | DataType | The return data type of the UDF. | required |
database | str | Database name for the UDF namespace. | None |
catalog | str | Catalog name for the UDF namespace. | None |
name | str | Name of the UDF. If None, generates a name from the function. | None |
**kwargs | | Additional configuration parameters (e.g., volatility settings). | {} |
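Extra keyword arguments are forwarded to the underlying UDF registration. As a hypothetical sketch of the volatility hint the table alludes to (the `volatility` keyword name and the `my_fn`/`my_schema` values are assumptions for illustration, not documented xorq parameters; check your backend's options):
>>> udf = make_pandas_udf(
...     fn=my_fn,                # hypothetical function defined elsewhere
...     schema=my_schema,        # hypothetical input schema
...     return_type=dt.float64,
...     volatility="immutable",  # hypothetical kwarg for a deterministic function
... )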
Returns
Type | Description |
---|---|
callable | A UDF constructor that can be used in expressions via its .on_expr() method. |
Examples
Creating a UDF that calculates penguin bill ratio:
>>> # Load penguins dataset
>>> penguins = xo.examples.penguins.fetch(backend=xo.connect())
>>> # Define the function
>>> def bill_ratio(df):
...     return df['bill_length_mm'] / df['bill_depth_mm']
>>> # Create UDF
>>> schema = penguins.select(['bill_length_mm', 'bill_depth_mm']).schema()
>>> bill_ratio_udf = make_pandas_udf(
...     fn=bill_ratio,
...     schema=schema,
...     return_type=dt.float64,
...     name="bill_ratio"
... )
>>> # Apply to table
>>> result = penguins.mutate(
...     bill_ratio=bill_ratio_udf.on_expr(penguins)
... ).execute()
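The expression built by .on_expr() is an ordinary column expression, so the same UDF can be used outside of mutate, for instance in a filter (a sketch reusing bill_ratio_udf from above):
>>> # Keep rows whose bill is more than three times as long as it is deep
>>> long_billed = penguins.filter(
...     bill_ratio_udf.on_expr(penguins) > 3
... ).execute()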
Creating a UDF for penguin size classification:
>>> def classify_penguin_size(df):
...     def size_category(row):
...         mass = row['body_mass_g']
...         flipper = row['flipper_length_mm']
...         if pd.isna(mass) or pd.isna(flipper):
...             return 'Unknown'
...         # Simple size classification based on body mass and flipper length
...         if mass > 4500 and flipper > 210:
...             return 'Large'
...         elif mass < 3500 and flipper < 190:
...             return 'Small'
...         else:
...             return 'Medium'
...     return df.apply(size_category, axis=1)
>>> size_schema = penguins.select(['body_mass_g', 'flipper_length_mm']).schema()
>>> size_udf = make_pandas_udf(
...     fn=classify_penguin_size,
...     schema=size_schema,
...     return_type=dt.string,
...     name="classify_size"
... )
>>> # Apply size classification
>>> result = penguins.mutate(
...     size_category=size_udf.on_expr(penguins)
... ).execute()
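After .execute() the result is a plain pandas DataFrame, so the derived column can be inspected with ordinary pandas calls:
>>> # Tally the derived categories with plain pandas
>>> counts = result['size_category'].value_counts()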
Creating a UDF for complex penguin feature engineering:
>>> def penguin_features(df):
...     # Create multiple derived features
...     features = pd.DataFrame(index=df.index)
...     # Bill area
...     features['bill_area'] = df['bill_length_mm'] * df['bill_depth_mm']
...     # Body condition index
...     features['body_condition'] = df['body_mass_g'] / (df['flipper_length_mm'] ** 2)
...     # Aspect ratio of bill
...     features['bill_aspect_ratio'] = df['bill_length_mm'] / df['bill_depth_mm']
...     # Return as a concatenated string for this example
...     return features.apply(
...         lambda row: f"area:{row['bill_area']:.1f}_bci:{row['body_condition']:.4f}_ratio:{row['bill_aspect_ratio']:.2f}",
...         axis=1,
...     )
>>> all_measurements = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
>>> features_schema = penguins.select(all_measurements).schema()
>>> features_udf = make_pandas_udf(
...     fn=penguin_features,
...     schema=features_schema,
...     return_type=dt.string,
...     name="penguin_features"
... )
>>> # Apply feature engineering
>>> result = penguins.mutate(
...     derived_features=features_udf.on_expr(penguins)
... ).execute()
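Since this example packs several features into one string, they can be split back out with plain pandas after execution; in practice you might instead define one UDF per derived column:
>>> # Recover the three packed fields as separate columns
>>> parts = result['derived_features'].str.split('_', expand=True)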
Notes
- The function receives a pandas DataFrame where columns correspond to the schema keys
- The function should return a pandas Series or scalar value compatible with return_type
- PyArrow arrays are automatically converted to pandas and back for seamless integration
- Use this when you need pandas-specific functionality like string operations, datetime handling, or complex data manipulations (see the sketch below)
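To illustrate the last point, a sketch of a UDF built on pandas’ vectorized string methods (it reuses the penguins table from the examples; the function and UDF names are illustrative):
>>> def species_label(df):
...     # Pandas-specific string handling: strip whitespace, then uppercase
...     return df['species'].str.strip().str.upper()
>>> species_schema = penguins.select(['species']).schema()
>>> species_udf = make_pandas_udf(
...     fn=species_label,
...     schema=species_schema,
...     return_type=dt.string,
...     name="species_label"
... )
>>> labeled = penguins.mutate(label=species_udf.on_expr(penguins)).execute()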
See Also
scalar : For PyArrow-based scalar UDFs with potentially better performance
make_pandas_expr_udf : For UDFs that need pre-computed values
agg : For aggregation functions