```python
import xorq as xo

# Connect to the embedded backend
con = xo.connect()
print(f"Connected to: {con}")
```

```
Connected to: <xorq.backends.let.Backend object at 0x7f81607c5c70>
```
Xorq supports various backends through optional dependencies:
```bash
# For DuckDB backend
pip install "xorq[duckdb]"

# For Snowflake backend
pip install "xorq[snowflake]"

# For PostgreSQL backend
pip install "xorq[postgres]"

# For PyIceberg backend
pip install "xorq[pyiceberg]"

# For DataFusion backend
pip install "xorq[datafusion]"

# Install all optional dependencies
pip install "xorq[examples,duckdb,snowflake,postgres,pyiceberg,datafusion]"
```
Xorq supports multiple backends for different use cases. Here’s how to configure and connect to each:
The embedded backend uses Xorq’s modified DataFusion engine optimized for Arrow UDF execution.
It is a good fit for local development and small datasets.
The PostgreSQL backend is suited to production workloads on existing PostgreSQL databases:
```python
import xorq as xo

# Method 1: connect from environment variables.
# Set POSTGRES_HOST, POSTGRES_PORT, POSTGRES_DATABASE,
# POSTGRES_USER, and POSTGRES_PASSWORD first.
pg_con = xo.postgres.connect_env()

# Method 2: pass connection parameters directly
pg_con = xo.postgres.connect(
    host="localhost",
    port=5432,
    database="your_database",
    user="your_user",
    password="your_password",
)

# Example usage
batting_table = pg_con.table("batting")
```
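The table handle is deferred; nothing runs until the expression is executed. Continuing the example above with the same `limit`/`execute` calls used throughout this page:

```python
# Pull a few rows to verify the connection; the query runs on Postgres
print(batting_table.limit(5).execute())
```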
The DuckDB backend is suited to analytical workloads that benefit from DuckDB's columnar engine.
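A minimal connection sketch, assuming the `duckdb` extra is installed; `read_parquet` follows the underlying Ibis DuckDB backend, and the Parquet path is hypothetical:

```python
import xorq as xo

# In-memory DuckDB connection (requires: pip install "xorq[duckdb]")
ddb = xo.duckdb.connect()

# Register a local Parquet file as a table (hypothetical path)
events = ddb.read_parquet("data/events.parquet", table_name="events")
print(events.limit(5).execute())
```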
The Snowflake backend covers cloud data warehouse operations.
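A hedged sketch of a Snowflake connection, assuming the `snowflake` extra is installed and that the backend mirrors the PostgreSQL pattern above; the parameter names follow the Ibis Snowflake backend, and all values are placeholders:

```python
import xorq as xo

# Credentials via keyword arguments (names per the Ibis Snowflake backend;
# check your installed version for the exact entry points)
sf_con = xo.snowflake.connect(
    user="your_user",
    password="your_password",
    account="your_account",
    database="YOUR_DB/YOUR_SCHEMA",
)
```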
```python
import xorq as xo

# Connect to the embedded backend
con = xo.connect()

# Load the iris example dataset
iris = xo.examples.iris.fetch(backend=con)

# Basic operations: filter, then aggregate per group
filtered = iris.filter(xo._.sepal_length > 5)
grouped = filtered.group_by("species").agg(xo._.sepal_width.sum())

# Execute the query
result = grouped.execute()
print(result)
```

```
      species  Sum(sepal_width)
0  Versicolor             131.8
1      Setosa              81.7
2   Virginica             146.2
```
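Until `execute()` is called, `grouped` is only an expression; the same expression can also stream out as Arrow record batches (a sketch; `to_pyarrow_batches()` is the Arrow-native counterpart of `execute()` in the Ibis-style API xorq exposes):

```python
# Stream the aggregation result as Arrow record batches
reader = grouped.to_pyarrow_batches()
print(reader.read_all())
```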
```python
import xorq as xo
from sklearn.neighbors import KNeighborsClassifier

from xorq.expr.ml.pipeline_lib import Step

# Load the penguins example dataset
con = xo.connect()
penguins = xo.examples.penguins.fetch(backend=con)

# Drop rows with missing feature values
filtered_penguins = penguins.filter(
    penguins.bill_length_mm.notnull(),
    penguins.bill_depth_mm.notnull(),
    penguins.flipper_length_mm.notnull(),
    penguins.body_mass_g.notnull(),
)

# Define features and target
features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
target = 'species'

# Create and fit the ML step
step = Step(KNeighborsClassifier)
fitted = step.fit(filtered_penguins, features=features, target=target)

# Make predictions as a deferred column
predictions = filtered_penguins.mutate(
    predicted=fitted.deferred_predict.on_expr
)

result = predictions.execute()
print(result)
```
```
       species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0       Adelie  Torgersen            39.1           18.7              181.0
1       Adelie  Torgersen            39.5           17.4              186.0
2       Adelie  Torgersen            40.3           18.0              195.0
3       Adelie  Torgersen            36.7           19.3              193.0
4       Adelie  Torgersen            39.3           20.6              190.0
..         ...        ...             ...            ...                ...
337  Chinstrap      Dream            55.8           19.8              207.0
338  Chinstrap      Dream            43.5           18.1              202.0
339  Chinstrap      Dream            49.6           18.2              193.0
340  Chinstrap      Dream            50.8           19.0              210.0
341  Chinstrap      Dream            50.2           18.7              198.0

     body_mass_g     sex  year  predicted
0         3750.0    male  2007     Adelie
1         3800.0  female  2007     Adelie
2         3250.0  female  2007  Chinstrap
3         3450.0  female  2007     Adelie
4         3650.0    male  2007  Chinstrap
..           ...     ...   ...        ...
337       4000.0    male  2009     Adelie
338       3400.0  female  2009  Chinstrap
339       3775.0    male  2009  Chinstrap
340       4100.0    male  2009     Adelie
341       3775.0  female  2009  Chinstrap

[342 rows x 9 columns]
```
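Because the fitted step yields a deferred predict column, it can be attached to any expression with the same feature columns; for example, scoring just one island's penguins (reusing only calls shown above):

```python
# Score a subset with the same fitted step; still fully deferred
dream = filtered_penguins.filter(filtered_penguins.island == "Dream")
dream_predictions = dream.mutate(predicted=fitted.deferred_predict.on_expr)
print(dream_predictions.execute().head())
```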
```python
import xorq as xo
from xorq.caching import SourceStorage

con = xo.connect()
pg = xo.postgres.connect_env()

# Create a cached expression
expr = (
    pg.table("large_table")
    .filter(xo._.date >= "2024-01-01")
    .group_by("category")
    .agg(total=xo._.amount.sum())
    .cache(SourceStorage(source=con))  # Cache results in the embedded backend
)

# First execution computes and caches
result1 = expr.execute()

# Second execution reuses the cached result (much faster!)
result2 = expr.execute()
```
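`SourceStorage` materializes the cache inside another backend; `xorq.caching` also provides a Parquet-backed storage for caching to local files. A sketch, assuming `ParquetStorage` takes a `source` the way `SourceStorage` does (other constructor arguments may vary by version):

```python
from xorq.caching import ParquetStorage

# Cache the filtered table as Parquet files managed by the embedded backend
cached = (
    pg.table("large_table")
    .filter(xo._.date >= "2024-01-01")
    .cache(ParquetStorage(source=con))
)
```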