Coverage for src/chuck_data/commands/tag_pii.py: 0%
64 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-06-05 22:56 -0700
« prev ^ index » next coverage.py v7.8.0, created at 2025-06-05 22:56 -0700
1"""
2Command handler for PII column tagging.
4This module contains the handler for applying semantic tags to columns
5containing Personally Identifiable Information (PII) in a table
6using SQL commands. It applies tags to columns identified by the scan_pii command
7rather than performing its own PII scanning.
8"""
10import logging
11from typing import Optional, Dict, Any, List
13from ..clients.databricks import DatabricksAPIClient
14from ..command_registry import CommandDefinition
15from ..config import get_warehouse_id, get_active_catalog, get_active_schema
16from .base import CommandResult
18# No need to import PII logic for scanning, we just apply the tags
def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandResult:
    """
    Tag already-identified PII columns of a table with semantic tags via SQL.

    No PII detection happens here: the caller supplies the columns (as
    produced by the scan_pii command) and this handler only applies the tags.

    Args:
        client: API client instance
        **kwargs:
            table_name (str): Name of the table to tag; a name without a "."
                is prefixed with the active catalog and schema
            pii_columns (list): List of columns with PII semantic info

    Returns:
        CommandResult: a failure result when a required input, the warehouse
        ID, the catalog/schema context or the table lookup is missing or
        fails. Otherwise a success result whose data holds the table info and
        the per-column tagging results; individual columns may still have
        failed, which is reflected in the message and in "tagging_results".
    """
    table_name = kwargs.get("table_name")
    pii_columns = kwargs.get("pii_columns", [])

    # Guard clauses: reject the call before touching config or the API.
    if not table_name:
        return CommandResult(False, message="table_name parameter is required.")
    if not pii_columns:
        return CommandResult(
            False, message="pii_columns parameter is required with columns to tag."
        )
    if not client:
        return CommandResult(False, message="Client is required for PII tagging.")

    try:
        # SQL statements are executed on the configured warehouse.
        warehouse_id = get_warehouse_id()
        if not warehouse_id:
            return CommandResult(
                False,
                message="No warehouse ID configured. Use /warehouse command to select a SQL warehouse first.",
            )

        active_catalog = get_active_catalog()
        active_schema = get_active_schema()

        # A name containing "." is used as given; a bare name needs both the
        # active catalog and the active schema to be qualified.
        if "." in table_name:
            qualified_name = table_name
        elif active_catalog and active_schema:
            qualified_name = f"{active_catalog}.{active_schema}.{table_name}"
        else:
            return CommandResult(
                False,
                message="No active catalog and schema selected. Use /catalog and /schema commands first, or provide a fully qualified table name.",
            )

        # Confirm the table exists and pick up its canonical name. Everything
        # in this try reports failures as a table-lookup problem.
        try:
            table_info = client.get_table(full_name=qualified_name)
            if not table_info:
                return CommandResult(
                    False, message=f"Table {qualified_name} not found."
                )

            qualified_name = table_info.get("full_name", qualified_name)
            short_name = qualified_name.split(".")[-1]
            total_columns = len(table_info.get("columns", []))
        except Exception as lookup_error:
            return CommandResult(
                False,
                message=f"Failed to retrieve table details: {str(lookup_error)}",
            )

        # One tagging attempt per supplied column.
        outcomes = apply_semantic_tags(
            client, qualified_name, pii_columns, warehouse_id
        )

        payload = {
            "table_name": short_name,
            "full_name": qualified_name,
            "column_count": total_columns,
            "pii_column_count": len(pii_columns),
            "pii_columns": pii_columns,
            "tagging_results": outcomes,
        }

        tagged_ok = len(
            [outcome for outcome in outcomes if outcome.get("success", False)]
        )
        summary = f"Applied semantic tags to {tagged_ok} of {len(pii_columns)} columns in {short_name}"
        return CommandResult(True, data=payload, message=summary)
    except Exception as e:
        logging.error(f"handle_tag_pii error for '{table_name}': {e}", exc_info=True)
        return CommandResult(
            False, error=e, message=f"Unexpected error in PII tagging: {str(e)}"
        )
def _quote_identifier(identifier: str) -> str:
    """Return one SQL identifier part as a backtick-delimited identifier.

    Embedded backticks are doubled. A part that is already a well-formed
    delimited identifier is returned unchanged so pre-quoted names keep working.
    """
    if len(identifier) >= 2 and identifier[0] == identifier[-1] == "`":
        inner = identifier[1:-1]
        if "`" not in inner.replace("``", ""):
            return identifier
    return "`" + identifier.replace("`", "``") + "`"


def _quote_table_name(full_table_name: str) -> str:
    """Backtick-quote every dot-separated part of a qualified table name."""
    return ".".join(_quote_identifier(part) for part in full_table_name.split("."))


def _escape_string_literal(value: str) -> str:
    """Backslash-escape backslashes and single quotes for a '...' SQL literal."""
    return value.replace("\\", "\\\\").replace("'", "\\'")


def apply_semantic_tags(
    client: DatabricksAPIClient,
    full_table_name: str,
    pii_columns: List[Dict[str, Any]],
    warehouse_id: str,
) -> List[Dict[str, Any]]:
    """
    Apply semantic tags to columns using SQL ALTER TABLE statements.

    One statement is submitted per column. A failure for one column (bad
    entry, rejected statement, or exception from the client) is recorded in
    that column's result and does not stop the remaining columns.

    Args:
        client: DatabricksAPIClient instance
        full_table_name: Full qualified table name (catalog.schema.table)
        pii_columns: List of columns with semantic tag information; each entry
            is expected to provide "name" and "semantic"
        warehouse_id: ID of the SQL warehouse to execute statements

    Returns:
        List of result dictionaries for each tagging operation. Each has
        "column" and "success"; "semantic_type" is present once the entry was
        valid, and "error" is present on failure.
    """
    tagging_results: List[Dict[str, Any]] = []

    # Identifiers and the tag value are escaped rather than interpolated raw,
    # so names with hyphens/spaces/backticks and values with quotes cannot
    # break (or inject into) the statement.
    quoted_table = _quote_table_name(full_table_name)

    for column in pii_columns:
        # A malformed entry must not abort the loop and discard the results
        # already collected for earlier columns.
        try:
            column_name = column.get("name")
            semantic_type = column.get("semantic")
        except AttributeError:
            tagging_results.append(
                {
                    "column": "unknown",
                    "success": False,
                    "error": "Column entry must be a mapping with 'name' and 'semantic' keys",
                }
            )
            continue

        if not column_name or not semantic_type:
            tagging_results.append(
                {
                    "column": column_name or "unknown",
                    "success": False,
                    "error": "Missing column name or semantic type",
                }
            )
            continue

        # Construct and execute the SQL ALTER TABLE statement
        quoted_column = _quote_identifier(str(column_name))
        tag_value = _escape_string_literal(str(semantic_type))
        sql = f"""
        ALTER TABLE {quoted_table}
        ALTER COLUMN {quoted_column}
        SET TAGS ('semantic' = '{tag_value}')
        """

        try:
            logging.info(f"Applying tag '{semantic_type}' to column '{column_name}'")
            result = client.submit_sql_statement(
                sql_text=sql, warehouse_id=warehouse_id, wait_timeout="30s"
            )

            # Treat a missing or null "status"/"error" the same way.
            status = result.get("status") or {}
            if status.get("state") == "SUCCEEDED":
                tagging_results.append(
                    {
                        "column": column_name,
                        "semantic_type": semantic_type,
                        "success": True,
                    }
                )
            else:
                error = (status.get("error") or {}).get("message", "Unknown error")
                tagging_results.append(
                    {
                        "column": column_name,
                        "semantic_type": semantic_type,
                        "success": False,
                        "error": error,
                    }
                )
        except Exception as e:
            logging.error(f"Error applying tag to {column_name}: {str(e)}")
            tagging_results.append(
                {
                    "column": column_name,
                    "semantic_type": semantic_type,
                    "success": False,
                    "error": str(e),
                }
            )

    return tagging_results
# Registry entry binding the "tag-pii-columns" command to handle_command.
DEFINITION = CommandDefinition(
    # Identity and dispatch
    name="tag-pii-columns",
    handler=handle_command,
    tui_aliases=["/tag-pii"],
    description="Apply semantic tags to columns identified by the scan_pii command",
    usage_hint='Example: /tag-pii --table_name my_table --pii_columns \'[{"name": "email", "semantic": "email"}]\'',
    # Accepted arguments; both are mandatory (see required_params)
    parameters={
        "table_name": {
            "type": "string",
            "description": "Name of the table to tag (can be fully qualified or just the table name)",
        },
        "pii_columns": {
            "type": "array",
            "description": "List of columns with PII information in format [{'name': 'colname', 'semantic': 'pii-type'}]",
        },
    },
    required_params=["table_name", "pii_columns"],
    # Visibility flags
    visible_to_user=True,
    visible_to_agent=True,
)