Coverage for src/chuck_data/commands/tag_pii.py: 0%

64 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-06-05 22:56 -0700

1""" 

2Command handler for PII column tagging. 

3 

4This module contains the handler for applying semantic tags to columns 

5containing Personally Identifiable Information (PII) in a table 

6using SQL commands. It applies tags to columns identified by the scan_pii command 

7rather than performing its own PII scanning. 

8""" 

9 

10import logging 

11from typing import Optional, Dict, Any, List 

12 

13from ..clients.databricks import DatabricksAPIClient 

14from ..command_registry import CommandDefinition 

15from ..config import get_warehouse_id, get_active_catalog, get_active_schema 

16from .base import CommandResult 

17 

18# No need to import PII logic for scanning, we just apply the tags 

19 

20 

def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandResult:
    """
    Tag the PII columns of a table with semantic tags via SQL.

    No PII detection happens here: the caller passes in the columns (as
    reported by the scan_pii command) and this handler only applies the tags.

    Args:
        client: API client instance
        **kwargs:
            table_name (str): Name of the table to tag
            pii_columns (list): List of columns with PII semantic info

    Returns:
        A failed CommandResult when an input, the warehouse, the
        catalog/schema or the table lookup is missing or raises; otherwise a
        successful CommandResult whose data holds the table summary and the
        per-column tagging results.
    """
    table_name: str = kwargs.get("table_name")
    pii_columns: List[Dict[str, Any]] = kwargs.get("pii_columns", [])

    # Guard clauses: reject incomplete input before touching config or the API.
    if not table_name:
        return CommandResult(False, message="table_name parameter is required.")
    if not pii_columns:
        return CommandResult(
            False, message="pii_columns parameter is required with columns to tag."
        )
    if not client:
        return CommandResult(False, message="Client is required for PII tagging.")

    try:
        # The SQL statements run on the warehouse selected in the config.
        warehouse_id = get_warehouse_id()
        if not warehouse_id:
            return CommandResult(
                False,
                message=(
                    "No warehouse ID configured. "
                    "Use /warehouse command to select a SQL warehouse first."
                ),
            )

        # Read the active catalog/schema up front; they are only needed when
        # the table name is not already qualified.
        active_catalog = get_active_catalog()
        active_schema = get_active_schema()

        if "." in table_name:
            qualified_name = table_name
        elif active_catalog and active_schema:
            qualified_name = f"{active_catalog}.{active_schema}.{table_name}"
        else:
            return CommandResult(
                False,
                message=(
                    "No active catalog and schema selected. "
                    "Use /catalog and /schema commands first, "
                    "or provide a fully qualified table name."
                ),
            )

        # Confirm the table exists and pick up its canonical name.
        try:
            table_details = client.get_table(full_name=qualified_name)
            if not table_details:
                return CommandResult(
                    False, message=f"Table {qualified_name} not found."
                )

            qualified_name = table_details.get("full_name", qualified_name)
            short_name = qualified_name.rsplit(".", 1)[-1]
            total_columns = len(table_details.get("columns", []))
        except Exception as lookup_error:
            return CommandResult(
                False,
                message=f"Failed to retrieve table details: {lookup_error}",
            )

        # One tagging attempt per supplied column.
        tagging_results = apply_semantic_tags(
            client, qualified_name, pii_columns, warehouse_id
        )
        tagged_ok = len(
            [entry for entry in tagging_results if entry.get("success", False)]
        )

        summary = {
            "table_name": short_name,
            "full_name": qualified_name,
            "column_count": total_columns,
            "pii_column_count": len(pii_columns),
            "pii_columns": pii_columns,
            "tagging_results": tagging_results,
        }
        msg = (
            f"Applied semantic tags to {tagged_ok} of {len(pii_columns)} "
            f"columns in {short_name}"
        )
        return CommandResult(True, data=summary, message=msg)
    except Exception as e:
        logging.error(f"handle_tag_pii error for '{table_name}': {e}", exc_info=True)
        return CommandResult(
            False, error=e, message=f"Unexpected error in PII tagging: {e}"
        )

111 

112 

def _quote_identifier(name: str) -> str:
    """Return *name* as a backtick-quoted SQL identifier."""
    # Accept a name the caller already wrapped in backticks: unwrap it first
    # so it is not quoted twice.
    if len(name) >= 2 and name.startswith("`") and name.endswith("`"):
        name = name[1:-1].replace("``", "`")
    # A literal backtick inside a quoted identifier is written doubled.
    return "`" + name.replace("`", "``") + "`"


def _escape_string_literal(value: str) -> str:
    """Escape *value* for use inside a single-quoted SQL string literal."""
    # Backslashes first, otherwise the escapes added for quotes get doubled.
    return value.replace("\\", "\\\\").replace("'", "\\'")


def apply_semantic_tags(
    client: "DatabricksAPIClient",
    full_table_name: str,
    pii_columns: List[Dict[str, Any]],
    warehouse_id: str,
) -> List[Dict[str, Any]]:
    """
    Apply semantic tags to columns using SQL ALTER TABLE statements.

    One statement is submitted per entry of ``pii_columns``. A failure on one
    column is recorded in the results and does not stop the remaining columns
    from being processed.

    The column name is emitted as a backtick-quoted identifier and the
    semantic type as an escaped string literal, so values containing quotes,
    spaces or backticks cannot break or alter the statement.
    ``full_table_name`` is interpolated unchanged.

    Args:
        client: DatabricksAPIClient instance
        full_table_name: Full qualified table name (catalog.schema.table)
        pii_columns: List of columns with semantic tag information; each entry
            is read through its "name" and "semantic" keys
        warehouse_id: ID of the SQL warehouse to execute statements

    Returns:
        List of result dictionaries for each tagging operation. Every entry
        has "column" and "success"; entries for attempted columns also carry
        "semantic_type", and failed entries carry "error".
    """
    tagging_results = []

    for column in pii_columns:
        column_name = column.get("name")
        semantic_type = column.get("semantic")

        if not column_name or not semantic_type:
            tagging_results.append(
                {
                    "column": column_name or "unknown",
                    "success": False,
                    "error": "Missing column name or semantic type",
                }
            )
            continue

        # Both values come from the caller, so quote/escape them rather than
        # pasting them into the statement verbatim.
        sql = (
            f"ALTER TABLE {full_table_name} "
            f"ALTER COLUMN {_quote_identifier(str(column_name))} "
            f"SET TAGS ('semantic' = '{_escape_string_literal(str(semantic_type))}')"
        )

        try:
            logging.info(
                "Applying tag '%s' to column '%s'", semantic_type, column_name
            )
            result = client.submit_sql_statement(
                sql_text=sql, warehouse_id=warehouse_id, wait_timeout="30s"
            )

            # Tolerate a missing/None result, status or error payload instead
            # of reporting an AttributeError text as the failure reason.
            status = (result or {}).get("status") or {}
            if status.get("state") == "SUCCEEDED":
                tagging_results.append(
                    {
                        "column": column_name,
                        "semantic_type": semantic_type,
                        "success": True,
                    }
                )
            else:
                error = (status.get("error") or {}).get("message") or "Unknown error"
                tagging_results.append(
                    {
                        "column": column_name,
                        "semantic_type": semantic_type,
                        "success": False,
                        "error": error,
                    }
                )
        except Exception as e:
            # Deliberately broad: tagging is best-effort per column, so any
            # failure is logged and recorded, then the loop moves on.
            logging.error("Error applying tag to %s: %s", column_name, e)
            tagging_results.append(
                {
                    "column": column_name,
                    "semantic_type": semantic_type,
                    "success": False,
                    "error": str(e),
                }
            )

    return tagging_results

194 

195 

# Registration entry for the PII tagging command: binds the command name and
# its TUI alias to handle_command and declares the parameters it accepts.
DEFINITION = CommandDefinition(
    name="tag-pii-columns",
    description=(
        "Apply semantic tags to columns identified by the scan_pii command"
    ),
    handler=handle_command,
    # Both parameters are mandatory; handle_command rejects the call when
    # either one is missing or empty.
    required_params=["table_name", "pii_columns"],
    parameters={
        "table_name": {
            "type": "string",
            "description": (
                "Name of the table to tag "
                "(can be fully qualified or just the table name)"
            ),
        },
        "pii_columns": {
            "type": "array",
            "description": (
                "List of columns with PII information in format "
                "[{'name': 'colname', 'semantic': 'pii-type'}]"
            ),
        },
    },
    # Exposed both to interactive users and to the agent.
    visible_to_user=True,
    visible_to_agent=True,
    tui_aliases=["/tag-pii"],
    usage_hint=(
        "Example: /tag-pii --table_name my_table --pii_columns "
        "'[{\"name\": \"email\", \"semantic\": \"email\"}]'"
    ),
)