Coverage for src/commands/sql_external_data.py: 0% (88 statements)
1"""
2Utility for fetching and parsing external SQL result data from Databricks.
4When SQL queries return large result sets, Databricks provides external_links
5to CSV files containing the data. This module handles fetching and parsing
6that external data.
7"""
9import csv
10import io
11import logging
12import requests
13from typing import List, Dict, Any, Optional
14from urllib.parse import urlparse
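Each entry in external_links arrives from the Databricks API as a dict; this module reads only its chunk_index, row_count, and external_link keys. A sketch of the expected shape follows (placeholder values; real responses carry additional fields not used here):

# Sketch of one external_links entry; values below are placeholders, not
# real Databricks output. Only these three keys are read by this module.
EXAMPLE_EXTERNAL_LINK = {
    "chunk_index": 0,    # position of this chunk within the result set
    "row_count": 1000,   # number of rows in this chunk's CSV
    "external_link": "https://example.com/results/chunk_0.csv",
}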
def fetch_external_data(external_link: str, timeout: int = 30) -> List[List[str]]:
    """
    Fetch CSV data from an external link and parse it into rows.

    Args:
        external_link: URL to fetch CSV data from
        timeout: Request timeout in seconds

    Returns:
        List of rows, where each row is a list of string values

    Raises:
        ValueError: If the URL is malformed
        requests.RequestException: If the HTTP request fails
        csv.Error: If CSV parsing fails
    """
    try:
        logging.debug(f"Fetching external SQL data from: {external_link}")

        # Validate the URL before issuing the request
        parsed_url = urlparse(external_link)
        if not parsed_url.scheme or not parsed_url.netloc:
            raise ValueError(f"Invalid URL: {external_link}")

        # Fetch the CSV data
        response = requests.get(external_link, timeout=timeout)
        response.raise_for_status()

        # Parse the CSV payload into a list of rows
        csv_content = response.text
        csv_reader = csv.reader(io.StringIO(csv_content))
        rows = list(csv_reader)

        logging.debug(f"Successfully fetched {len(rows)} rows from external link")
        return rows

    except requests.RequestException as e:
        logging.error(f"Failed to fetch external data from {external_link}: {e}")
        raise
    except csv.Error as e:
        logging.error(f"Failed to parse CSV data from {external_link}: {e}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error fetching external data: {e}")
        raise
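A minimal usage sketch; the URL is a placeholder standing in for a presigned link taken from an external_links entry:

# Placeholder URL; in practice this comes from an external_links entry.
rows = fetch_external_data("https://example.com/results/chunk_0.csv", timeout=10)
print(f"Fetched {len(rows)} rows")
if rows:
    print(f"First row: {rows[0]}")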
def fetch_chunk_data(
    external_links: List[Dict[str, Any]], chunk_index: int
) -> Optional[List[List[str]]]:
    """
    Fetch data for a specific chunk by chunk_index.

    Args:
        external_links: List of external link objects from Databricks API response
        chunk_index: Index of the chunk to fetch

    Returns:
        List of rows for the specified chunk, or None if the chunk is not found
    """
    # Find the external link entry for the requested chunk
    target_link = None
    for link in external_links:
        if link.get("chunk_index") == chunk_index:
            target_link = link
            break

    if target_link is None:
        logging.warning(f"No external link found for chunk_index {chunk_index}")
        return None

    external_url = target_link.get("external_link")
    if not external_url:
        logging.warning(f"No external_link URL found in chunk {chunk_index}")
        return None

    try:
        return fetch_external_data(external_url)
    except Exception as e:
        logging.error(f"Failed to fetch chunk {chunk_index}: {e}")
        raise
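A sketch of fetching one chunk out of several; the list below uses the same placeholder shape as EXAMPLE_EXTERNAL_LINK above:

# Placeholder link list with two 1000-row chunks.
external_links = [
    {"chunk_index": 0, "row_count": 1000,
     "external_link": "https://example.com/results/chunk_0.csv"},
    {"chunk_index": 1, "row_count": 1000,
     "external_link": "https://example.com/results/chunk_1.csv"},
]
chunk_rows = fetch_chunk_data(external_links, chunk_index=1)
if chunk_rows is None:
    print("Chunk 1 not found or missing its URL")
else:
    print(f"Chunk 1 has {len(chunk_rows)} rows")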
def get_paginated_rows(
    external_links: List[Dict[str, Any]], start_row: int, num_rows: int = 50
) -> List[List[str]]:
    """
    Get a specific page of rows from external links.

    Args:
        external_links: List of external link objects from Databricks API response
        start_row: Starting row index (0-based)
        num_rows: Number of rows to fetch

    Returns:
        List of rows for the requested page
    """
    # Sort external links by chunk_index to ensure proper order
    sorted_links = sorted(external_links, key=lambda x: x.get("chunk_index", 0))

    current_row = 0
    result_rows = []

    for link in sorted_links:
        chunk_row_count = link.get("row_count", 0)
        chunk_start = current_row
        chunk_end = current_row + chunk_row_count

        # Check whether this chunk overlaps the requested range
        # [start_row, start_row + num_rows)
        if start_row < chunk_end and current_row < start_row + num_rows:
            # We need some data from this chunk
            try:
                chunk_data = fetch_chunk_data([link], link.get("chunk_index"))
                if chunk_data:
                    # Calculate which rows from this chunk we need
                    local_start = max(0, start_row - chunk_start)
                    local_end = min(chunk_row_count, start_row + num_rows - chunk_start)

                    if local_start < len(chunk_data):
                        chunk_slice = chunk_data[local_start:local_end]
                        result_rows.extend(chunk_slice)

                    # If we have enough rows, we're done
                    if len(result_rows) >= num_rows:
                        return result_rows[:num_rows]
            except Exception as e:
                logging.error(f"Failed to fetch chunk {link.get('chunk_index')}: {e}")
                # Continue with the remaining chunks

        current_row += chunk_row_count

        # If we've passed our target range, we're done
        if current_row >= start_row + num_rows:
            break

    return result_rows
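A sketch of a page that straddles a chunk boundary, reusing the placeholder external_links list above. With two 1000-row chunks, a request for rows 950 through 1049 is sliced partly from each chunk:

page = get_paginated_rows(external_links, start_row=950, num_rows=100)
# Rows 950-999 come from chunk 0; rows 1000-1049 come from chunk 1.
print(f"Got {len(page)} rows")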
class PaginatedSQLResult:
    """
    Class to manage paginated SQL results with external data fetching.
    """

    def __init__(
        self,
        columns: List[str],
        external_links: List[Dict[str, Any]],
        total_row_count: int,
        chunks: List[Dict[str, Any]],
    ):
        self.columns = columns
        self.external_links = external_links
        self.total_row_count = total_row_count
        self.chunks = chunks
        self.current_position = 0
        self.page_size = 50

    def get_next_page(self) -> Tuple[List[List[str]], bool]:
        """
        Get the next page of results.

        Returns:
            Tuple of (rows, has_more), where rows is a list of data rows
            and has_more indicates whether more pages are available
        """
        if self.current_position >= self.total_row_count:
            return [], False

        rows = get_paginated_rows(
            self.external_links, self.current_position, self.page_size
        )

        self.current_position += len(rows)
        has_more = self.current_position < self.total_row_count
        return rows, has_more

    def reset(self):
        """Reset pagination to the beginning."""
        self.current_position = 0
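A usage sketch for the pagination loop, again with the placeholder external_links; columns and total_row_count are assumed values matching those placeholders. The empty-page guard matters because get_next_page advances only by len(rows), so a zero-row page (for example, when every chunk fetch fails) would otherwise loop forever:

result = PaginatedSQLResult(
    columns=["id", "name"],          # placeholder column names
    external_links=external_links,   # placeholder list from the sketch above
    total_row_count=2000,
    chunks=[],
)
while True:
    rows, has_more = result.get_next_page()
    if not rows:
        break  # empty page: avoid spinning when no data could be fetched
    print(f"Page of {len(rows)} rows "
          f"({result.current_position}/{result.total_row_count})")
    if not has_more:
        break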