Configuration
Config file
Clone-Xs uses a YAML config file (default: config/clone_config.yaml). Generate a starter config with:
clxs init
Full reference
# ── Source and destination ─────────────────────────────
source_catalog: "production"
destination_catalog: "staging"
# ── SQL warehouse ─────────────────────────────────────
sql_warehouse_id: "1a86a25830e584b7"
# ── Clone settings ────────────────────────────────────
clone_type: "DEEP" # DEEP or SHALLOW
load_type: "FULL" # FULL, INCREMENTAL, or SELECTIVE
# ── Managed location (required for some workspaces) ───
catalog_location: "" # e.g. "abfss://catalog@storage.dfs.core.windows.net/staging"
# API field name: `location` (the router maps `location` → `catalog_location` internally)
# ── Parallelism ───────────────────────────────────────
max_workers: 4 # Concurrent clone operations
# ── Schema filtering ─────────────────────────────────
include_schemas: [] # Empty = all schemas
exclude_schemas:
- "information_schema"
- "default"
# ── Table filtering ──────────────────────────────────
table_pattern: "" # Regex for inclusion (empty = all)
exclude_table_pattern: "" # Regex for exclusion
# ── Partial-scope selection ──────────────────────────
# Clone a specific subset of schemas + objects. Produced by the UI Scope Picker.
# Translated by the router into include_schemas + an anchored include_tables_regex,
# so it composes with include/exclude_schemas and the regex fields above.
# Volumes are enumerated per-schema and don't honor the regex — selecting a
# specific volume will include the whole schema's volumes.
include_objects:
- { schema: "sales", name: "orders", type: "table" }
- { schema: "sales", name: "customers", type: "table" }
- { schema: "marketing", name: "v_campaigns", type: "view" }
- { schema: "analytics", name: "calc_discount", type: "function" }
# ── Tag filtering ────────────────────────────────────
filter_tags: {} # e.g. { "env": "prod", "tier": "gold" }
# ── Metadata copying ────────────────────────────────
copy_permissions: true
copy_ownership: true
copy_tags: true
copy_security: true # Row filters and column masks
copy_constraints: true # PK, FK, NOT NULL
copy_comments: true
# ── Time travel ──────────────────────────────────────
timestamp_as_of: "" # e.g. "VERSION 42" or "2026-01-15T00:00:00"
# ── Rate limiting ───────────────────────────────────
rate_limit: 10 # Max SQL requests per second
# ── Retry policy ────────────────────────────────────
max_retries: 3
retry_delay: 2 # Seconds between retries
# ── Pre/post hooks ──────────────────────────────────
pre_clone_hooks: [] # SQL statements to run before clone
post_clone_hooks: [] # SQL statements to run after clone
# ── Runtime guardrails ───────────────────────────────
# Abort the clone in flight if it exceeds these limits. Checked between
# schemas. When tripped, the job's summary gets aborted=true + abort_reason.
max_duration_min: null # e.g. 60 — null = no wall-clock limit
max_tables: null # e.g. 500 — null = no table-count limit
# ── Named snapshot source ────────────────────────────
# Clone FROM a previously captured snapshot instead of the catalog's current
# state. See guide/snapshots.md. Resolved to `as_of_timestamp` at run time.
source_snapshot_id: null # UUID of a row in <audit>.clone_snapshots
# ── Cross-workspace / cross-cloud migration ─────────
# When target_workspace is set, the clone runs through the Delta Sharing + DEEP
# CLONE orchestrator (see the Cross-workspace guide). Leave unset for normal
# same-workspace clones.
#
# UI USERS: ignore this block. Saved target connections are managed in the
# browser via /settings → Target Workspaces and stored in localStorage
# (key: clxs_target_connections). The /clone payload assembly resolves the
# picked connection name into the same `target_workspace` object inline at
# request time, so the server never needs to know about saved targets and
# no PAT or client_secret is persisted on disk. This block is for users who
# drive Clone-Xs via the API directly with one-off creds.
target_workspace:
host: "" # e.g. "https://adb-target.azuredatabricks.net"
auth_method: "pat" # "pat" | "service_principal" | "profile"
token: "" # required when auth_method="pat"
client_id: "" # required when auth_method="service_principal"
client_secret: "" # required when auth_method="service_principal"
profile: "" # required when auth_method="profile"
warehouse_id: "" # target SQL warehouse for DDL + DEEP CLONE
keep_share: false # legacy/informational — leave the Delta Share intact (use cleanup_after_clone instead)
# How re-runs treat tables that already exist on the target.
# snapshot_once = CREATE TABLE IF NOT EXISTS ... DEEP CLONE (default; skip existing)
# incremental = CREATE OR REPLACE TABLE ... DEEP CLONE (mirror source; overwrites target writes)
# force_full = DROP + CREATE (full re-clone every run)
data_sync_mode: snapshot_once
# Delta Sharing refuses tables with column masks or row filters. When true,
# Clone-Xs drops them on source before adding the table to the share, then
# re-applies them on the target after the clone. For snapshot_once /
# force_full the masks are also restored on source. For incremental the
# source masks remain dropped (otherwise ongoing share reads break).
# The ADD TABLE loop also has a retry fallback: if Delta Sharing rejects
# an ADD because of a mask/filter that the upfront DESCRIBE-EXTENDED-based
# inventory missed, the loop catches the specific error, runs inventory +
# drop, and retries the ADD once.
auto_handle_masks: false
# Lifecycle of the deterministic share/recipient/shared-catalog. Default
# false means objects persist between runs so subsequent re-clones reuse
# them (true incremental sync). Set true for one-shot migrations where
# you don't intend to re-run.
cleanup_after_clone: false
# When true, re-runs also `ALTER SHARE … REMOVE TABLE` for tables that are
# in the share but no longer exist in the source. Default false because
# pruning is destructive on the share side.
prune_share_extras: false
# ⚠ Same-metastore preflight: if both source and target workspaces attach to
# the same Unity Catalog metastore, Clone-Xs refuses to run the cross-workspace
# orchestrator (Delta Sharing requires distinct metastores — you cannot share
# to yourself). Use the in-metastore clone path instead by removing
# target_workspace from your config. The check compares
# `client.metastores.summary().global_metastore_id` for both clients.
# Toggle which object types migrate cross-workspace (all default true).
# copy_permissions / copy_ownership / copy_tags above also apply.
clone_views: true # re-issue view DDL on target (catalog refs rewritten)
clone_functions: true # re-issue SQL function DDL on target
clone_volumes: true # recreate volumes and copy files via the Files API
volume_max_file_mb: 500 # skip files larger than this during volume copy
# ── Ordering ─────────────────────────────────────────
order_by_size: "desc" # "desc" = biggest first (fail fast on storage), "asc" = small tables finish early, "" = unordered
# ── Notifications ────────────────────────────────────
notifications:
slack_webhook: ""
email: ""
# ── Lineage ──────────────────────────────────────────
enable_lineage: false # Track clone lineage metadata
# ── Auto-rollback ──────────────────────────────────────
auto_rollback_on_failure: false # Trigger rollback on validation failure
rollback_threshold: 5.0 # Max mismatch % before rollback
# ── Data filtering ─────────────────────────────────────
where_clauses: # WHERE clause filters (deep clone only)
"*": "year >= 2024" # Global filter
"sales.orders": "region = 'US'" # Per-table filter
# ── Throttle controls ──────────────────────────────────
throttle: null # "low", "medium", "high", "max", or null
throttle_schedule: # Time-based throttle switching
- hours: "0-6"
profile: "high"
- hours: "9-17"
profile: "low"
# ── Checkpointing ─────────────────────────────────────
checkpoint_enabled: false
checkpoint_interval_tables: 50 # Save every N tables
checkpoint_interval_minutes: 5 # Save every N minutes
# ── Metrics ────────────────────────────────────────────
metrics_enabled: false
metrics_destination: "delta" # "delta", "json", "prometheus", "webhook"
metrics_table: "clone_audit.metrics.clone_metrics"
# ── TTL policies ───────────────────────────────────────
ttl_enabled: false
ttl_default_days: 0 # 0 = no default TTL
ttl_warn_days: 3 # Warn N days before expiry
# ── RBAC ───────────────────────────────────────────────
rbac_enabled: false
rbac_policy_path: "~/.clone-xs/rbac_policy.yaml"
# ── Approval workflows ────────────────────────────────
approval_required: false # true, false, or regex (e.g., "prod_.*")
approval_channel: "cli" # "cli" or "slack"
approval_timeout_hours: 24
# ── Impact analysis ───────────────────────────────────
impact_check_before_clone: false
impact_high_threshold: 10 # Objects count for "high" risk
# ── Compliance ─────────────────────────────────────────
compliance_report_enabled: false
compliance_retention_days: 90
# ── Plugins ────────────────────────────────────────────
plugin_dir: "~/.clone-xs/plugins"
auto_load_plugins: true
# ── Scheduling ─────────────────────────────────────────
schedule_interval: null # e.g., "6h"
schedule_cron: null # e.g., "0 */6 * * *"
drift_check_before_clone: true
# ── Logging & Audit ────────────────────────────────────
save_run_logs: true # Persist run logs to Delta (default: true)
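# metrics_enabled (persist metrics to Delta; default: false) and metrics_table are
# already set in the Metrics section above. Do not repeat them here: duplicate
# top-level keys are invalid YAML and most parsers silently keep only the last value.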
audit_trail:
catalog: clone_audit # Delta catalog for audit tables
schema: logs # Schema name
table: clone_operations # Audit trail table name
# ── API server ─────────────────────────────────────────
api_port: 8080
api_host: "0.0.0.0"
api_key: null # null = API auth disabled; set a key to enable API auth
Config profiles
Define multiple profiles in the same config file, then select one at runtime:
profiles:
dev:
source_catalog: "production"
destination_catalog: "dev_sandbox"
clone_type: "SHALLOW"
max_workers: 2
staging:
source_catalog: "production"
destination_catalog: "staging"
clone_type: "DEEP"
max_workers: 8
copy_permissions: true
# Use the staging profile
clxs clone --profile staging
Clone options reference
This section describes every field in the Clone page's Options step, together with its matching YAML key. Hovering over the info icon next to a field in the UI shows the same description.
Clone type & load type
| Field | YAML key | Description |
|---|---|---|
| Clone Type | clone_type | DEEP copies all data files into the destination (independent of source). SHALLOW only copies metadata — destination points at source files. |
| Load Type | load_type | FULL re-clones every table on each run. INCREMENTAL only clones tables whose Delta version advanced since the last run. SELECTIVE re-clones only tables that drifted since the last clone — see Selective re-clone in the clone guide. |
| Dry Run | dry_run | Render every SQL statement but don't execute. Outputs the same logs and metrics as a real run, prefixed with [DRY RUN]. Use for review before executing destructive operations. Default false. |
Target format (Iceberg)
| Field | YAML key | Description |
|---|---|---|
| Target Format | target_format | DELTA (default) or ICEBERG. With ICEBERG, the destination stays Delta but UniForm metadata (delta.universalFormat.enabledFormats=iceberg + IcebergCompatV2 + columnMapping=name) is enabled post-clone so external Iceberg engines can read it. Only effective on Delta sources — non-Delta sources skip with a WARN. |
| Physical Iceberg | iceberg_physical | Only meaningful with target_format: ICEBERG. When true, swaps the UniForm path for CREATE TABLE … USING iceberg AS SELECT … so UC reports the destination as Data source: Iceberg. Loses Delta history, ignores time-travel arguments, requires DBR 15+ with Iceberg-managed-table support. Default false. |
Compute
| Field | YAML key | Description |
|---|---|---|
| Use Serverless Compute | serverless | Run the clone as a serverless Databricks job instead of against a SQL warehouse. Zero-warehouse cost for one-offs and CI. |
| UC Volume | volume | Unity Catalog volume path where Clone-Xs uploads itself for the serverless job to execute. |
Performance
| Field | YAML key | Description |
|---|---|---|
| Max Workers (schemas) | max_workers | Number of schemas processed in parallel. Raise for wide catalogs; each worker holds one warehouse slot. |
| Parallel Tables | parallel_tables | Tables cloned in parallel within a single schema. Raise for many small tables; lower for big tables on a shared warehouse. |
| Max Parallel Queries | max_parallel_queries | Upper bound on concurrent SQL statements across all workers. Prevents warehouse saturation. |
| Max RPS | max_rps | Rate-limit statements per second across all workers. 0 = unlimited. Use to protect shared upstream systems. |
| Order by Size | order_by_size | Clone order by table byte-size. desc = biggest first (fails fast on storage issues); asc = small tables finish early. |
| Throttle Profile | throttle | Pre-defined throughput profile. low = minimal warehouse load; max = no self-limiting. Overrides parallel_tables and max_parallel_queries. |
Copy options
These options control which metadata is copied from source to destination. All default to true.
| Field | YAML key | Description |
|---|---|---|
| Permissions | copy_permissions | Copy Unity Catalog grants (SELECT, MODIFY, etc.) from source objects to destination. |
| Ownership | copy_ownership | Set the destination object's OWNER to match the source. |
| Tags | copy_tags | Copy Unity Catalog tags (key-value annotations) on catalogs, schemas, tables, and columns. |
| Properties | copy_properties | Copy Delta table properties (delta.autoOptimize, delta.minReaderVersion, etc.). |
| Security | copy_security | Copy row filters and column masks attached to source tables. |
| Constraints | copy_constraints | Copy NOT NULL and CHECK constraints from source tables. |
| Comments | copy_comments | Copy table and column comments. |
Features
| Field | YAML key | Description |
|---|---|---|
| Enable Rollback | enable_rollback | Write a rollback manifest so clxs rollback can undo the clone later. |
| Auto Rollback on Fail | auto_rollback | Automatically trigger rollback if post-clone validation detects more mismatches than rollback_threshold. |
| Validate After Clone | validate_after_clone | Run row-count validation after each table clone completes. |
| Checksum Validation | validate_checksum | Use SHA-256 over hashed columns in addition to row counts. Slower but catches silent data drift. |
| Force Re-clone | force_reclone | Drop and recreate destination tables even when they already exist. Otherwise existing tables are skipped. |
| Schema Only | schema_only | Create destination schemas + empty tables but skip the actual data copy. Useful for schema-migration dry runs. |
| Generate Report | generate_report | Emit an HTML audit report summarising what was cloned, mismatches, and timings. |
| Show Progress | show_progress | Render live progress bars in the CLI / job logs. |
| Enable Checkpoint | checkpoint | Persist per-table progress to a checkpoint file so interrupted clones can resume where they left off. |
| Require Approval | require_approval | Pause the job before any write operation and wait for manual approval in the UI. |
| Impact Check | impact_check | Pre-flight scan for downstream dependencies (views, jobs, dashboards) that reference the destination. |
| Skip Unused Tables | skip_unused | Skip tables with zero recent usage in system.access.table_lineage. Trims the scope of dev-refresh jobs. |
| Verbose Logging | verbose | Emit DEBUG-level logs for every SQL statement. Large output volume — use for troubleshooting only. |
| Rollback Threshold (%) | rollback_threshold | Maximum percentage of row mismatches tolerated before Auto Rollback on Fail kicks in. Only shown when auto-rollback is enabled. |
| Auto-mask PII | auto_mask_pii | Auto-detect PII columns via UC column_tags (EMAIL / SSN / CREDIT_CARD / PHONE) and mask them on the destination via the existing src/masking.py pipeline. Masking runs as a post-clone UPDATE — bounded exposure window. Default false. |
| Auto-retry transient failures | enable_retry | Retry transient clone failures (network, throttle, HTTP 429 / 5xx) with exponential backoff. Logical errors (schema mismatch, permission, validation) never retry. Bounded by max_retries. Default true. |
| Compare DQ after clone | compare_dq_after_clone | Run a column-level DQ comparison after each schema clones — row count + per-column NULL counts on source vs target. Combined with auto_rollback, max-drift exceeding dq_drift_rollback_pct triggers Delta RESTORE. Adds one warehouse round-trip per cloned table. Default false. |
| DQ drift rollback % | dq_drift_rollback_pct | Drift threshold (0–100) for compare_dq_after_clone. Default 5.0 — matches the row-count rollback_threshold so operators have one mental model for "acceptable drift." |
| Pre-clone source quiesce | quiesce_source | Snapshot + revoke write privileges on the source schemas at clone start, re-grant in a finally block at clone end. Prevents concurrent writes from landing mid-clone (silent partial-time-travel divergence between tables). Default false. See clone guide — pre-clone quiesce. |
Filtering
| Field | YAML key | Description |
|---|---|---|
| Include Schemas | include_schemas | Comma-separated schema names to clone. Empty = all schemas (minus excludes). |
| Exclude Schemas | exclude_schemas | Comma-separated schemas to skip. information_schema and default are excluded by default. |
| Include Tables Regex | include_tables_regex | Only tables + views whose name matches this regex are cloned. Applies after include/exclude schemas. |
| Exclude Tables Regex | exclude_tables_regex | Tables + views whose name matches this regex are skipped. Takes precedence over include regex. |
Time travel
| Field | YAML key | Description |
|---|---|---|
| As-of Timestamp | as_of_timestamp | Clone each source table as it existed at this timestamp. Requires the source's Delta version retention to cover this point. |
| As-of Version | as_of_version | Clone each source table at this specific Delta transaction version. |
Advanced
| Field | YAML key | Description |
|---|---|---|
| WHERE Clauses | where_clauses | Dict, not a single string. Per-table row predicates keyed by "schema.table" (or "*" wildcard for all). Forces the matched per-table CLONE to a CTAS path (CREATE TABLE … AS SELECT * FROM src WHERE …) — loses Delta source history. DEEP-only; ignored on SHALLOW with a WARN. Example: {"bronze.events": "date >= '2026-01-01'", "*": "is_deleted = false"}. |
| TBLPROPERTIES override | clone_tbl_properties | Dict of TBLPROPERTIES (...) rendered onto every per-table CLONE statement. Required for properties that must be on the first commit (e.g. delta.logRetentionDuration, delta.deletedFileRetentionDuration for archival retention) — setting these via post-clone ALTER TABLE is too late. Single quotes in values are SQL-escaped by doubling. |
| TTL | ttl | Auto-expiry for the destination catalog (e.g. 7d, 30d, 2w). A background cleanup job drops expired catalogs. |
| Template | template | Named config preset (e.g. dev-refresh, dr-replica) that overrides common flags. See clxs templates list for available presets. |
| Max duration (min) | max_duration_min | Runtime guardrail — abort the clone if wall-clock exceeds this many minutes. null = no limit. Checked between schemas. |
| Max tables | max_tables | Runtime guardrail — abort after this many tables have been touched (success/failed/skipped). Safety net against runaway scope changes. |
| Source snapshot ID | source_snapshot_id | Clone from a named snapshot instead of the catalog's current state. The snapshot's captured_at is applied as as_of_timestamp. See Clone Snapshots. |
| Databricks CLI profile | profile | Optional ~/.databrickscfg profile name to authenticate with. Distinct from target_workspace.profile (which selects a profile for cross-workspace auth). Falls back to the default profile when unset. |
| Scope picker selection | include_objects | Granular per-object selection — list of {schema, name, type} records where type is table, view, function, or volume. Translated by the API router into include_schemas + an anchored include_tables_regex. Use when the wizard's Scope Picker is in "Select schemas + objects" mode. |
| Multi-target fanout | target_workspaces | List of target_workspace objects (same shape as the singular field). When set, routes to the multi-target fanout orchestrator — runs N cross-workspace clones in parallel, one per target. Mutually exclusive with target_workspace (Pydantic XOR validator rejects setting both). |
| Fanout max parallel | fanout_max_parallel | Cap on simultaneous target clones in fanout mode. Higher values increase source-side egress bandwidth pressure; lower values serialize. Default 5 matches typical N-region DR fanout (us, eu, apac, etc.). |
CLI overrides
CLI flags override config file values. For example:
# Config says DEEP but CLI overrides to SHALLOW
clxs clone --clone-type SHALLOW
Environment variables
Environment variables override config file values for auth settings and the metadata cache TTL:
| Variable | Purpose |
|---|---|
| DATABRICKS_HOST | Workspace URL |
| DATABRICKS_TOKEN | Personal access token |
| DATABRICKS_CLIENT_ID | OAuth service principal client ID |
| DATABRICKS_CLIENT_SECRET | OAuth service principal secret |
| AZURE_CLIENT_ID | Azure AD service principal client ID |
| AZURE_CLIENT_SECRET | Azure AD service principal secret |
| AZURE_TENANT_ID | Azure AD tenant ID |
| DATABRICKS_CONFIG_PROFILE | Default CLI profile name |
| CLXS_CACHE_TTL | Metadata cache TTL in seconds (default: 300) |
RTBF (Right to Be Forgotten)
rtbf:
enabled: true # Enable RTBF feature
default_strategy: delete # delete | anonymize | pseudonymize
deadline_days: 30 # GDPR 30-day deadline
default_grace_period_days: 0 # Days to wait before execution
auto_vacuum: true # Auto-VACUUM after deletion
vacuum_retention_hours: 0 # 0 = remove all Delta history
require_approval: true # Require manual approval
verification_required: true # Require verification pass
certificate_auto_generate: true # Auto-generate certificate
certificate_output_dir: reports/rtbf
exclude_schemas:
- information_schema
- default
| Key | Default | Description |
|---|---|---|
| enabled | true | Enable/disable RTBF feature |
| default_strategy | delete | Default deletion strategy |
| deadline_days | 30 | GDPR deadline in days |
| default_grace_period_days | 0 | Default grace period before execution |
| auto_vacuum | true | Automatically VACUUM after deletion |
| vacuum_retention_hours | 0 | VACUUM retention (0 = aggressive) |
| require_approval | true | Require approval before execution |
| verification_required | true | Require verification before completion |
| certificate_auto_generate | true | Auto-generate certificates |
| certificate_output_dir | reports/rtbf | Output directory for certificates |
| exclude_schemas | [information_schema, default] | Schemas to skip during discovery |
DSAR (Data Subject Access Request)
dsar:
deadline_days: 30 # GDPR 30-day requirement
default_export_format: csv # csv | json | parquet
export_output_dir: reports/dsar
require_approval: true
Clone Pipelines
pipelines:
max_concurrent_steps: 1 # sequential execution
default_on_failure: abort # abort | skip | retry
retry_max_attempts: 3
retry_backoff_seconds: 30
Data Observability
observability:
health_score_weights:
freshness: 0.25
volume: 0.15
anomaly: 0.20
sla: 0.25
dq: 0.15
issue_lookback_hours: 24
trend_days: 30
Config diff
Compare two config files to see differences:
clxs config-diff config/dev.yaml config/staging.yaml