Coverage for src/chuck_data/commands/sql_external_data.py: 0%

88 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-06-05 22:56 -0700

1""" 

2Utility for fetching and parsing external SQL result data from Databricks. 

3 

4When SQL queries return large result sets, Databricks provides external_links 

5to CSV files containing the data. This module handles fetching and parsing 

6that external data. 

7""" 

8 

9import csv 

10import io 

11import logging 

12import requests 

13from typing import List, Dict, Any, Optional 

14from urllib.parse import urlparse 

15 

16 

def fetch_external_data(external_link: str, timeout: int = 30) -> List[List[str]]:
    """
    Fetch CSV data from an external link and parse it into rows.

    Args:
        external_link: URL to fetch CSV data from
        timeout: Request timeout in seconds

    Returns:
        List of rows, where each row is a list of string values

    Raises:
        ValueError: If the URL lacks a scheme or network location
        requests.RequestException: If HTTP request fails
        csv.Error: If CSV parsing fails
    """
    # Validate the URL before entering the try block so a malformed link
    # raises ValueError directly instead of being caught by the broad
    # handler below and mislabeled as an "unexpected" error.
    parsed_url = urlparse(external_link)
    if not parsed_url.scheme or not parsed_url.netloc:
        raise ValueError(f"Invalid URL: {external_link}")

    try:
        logging.debug(f"Fetching external SQL data from: {external_link}")

        # Fetch the CSV data
        response = requests.get(external_link, timeout=timeout)
        response.raise_for_status()

        # Parse CSV data from the response body
        csv_reader = csv.reader(io.StringIO(response.text))
        rows = list(csv_reader)

        logging.debug(f"Successfully fetched {len(rows)} rows from external link")
        return rows

    except requests.RequestException as e:
        logging.error(f"Failed to fetch external data from {external_link}: {e}")
        raise
    except csv.Error as e:
        logging.error(f"Failed to parse CSV data from {external_link}: {e}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error fetching external data: {e}")
        raise

61 

62 

def fetch_chunk_data(
    external_links: List[Dict[str, Any]], chunk_index: int
) -> Optional[List[List[str]]]:
    """
    Fetch the row data for a single chunk, identified by its chunk_index.

    Args:
        external_links: External link objects from the Databricks API response
        chunk_index: Index of the chunk to fetch

    Returns:
        The rows for the matching chunk, or None when no matching link
        (or link URL) exists.
    """
    # Locate the first link entry whose chunk_index matches.
    matching = next(
        (entry for entry in external_links if entry.get("chunk_index") == chunk_index),
        None,
    )

    if not matching:
        logging.warning(f"No external link found for chunk_index {chunk_index}")
        return None

    url = matching.get("external_link")
    if not url:
        logging.warning(f"No external_link URL found in chunk {chunk_index}")
        return None

    try:
        return fetch_external_data(url)
    except Exception as e:
        logging.error(f"Failed to fetch chunk {chunk_index}: {e}")
        raise

97 

98 

def get_paginated_rows(
    external_links: List[Dict[str, Any]], start_row: int, num_rows: int = 50
) -> List[List[str]]:
    """
    Get a specific page of rows from external links.

    Args:
        external_links: List of external link objects from Databricks API response
        start_row: Starting row index (0-based)
        num_rows: Number of rows to fetch

    Returns:
        List of rows for the requested page
    """
    # Walk the chunks in chunk_index order so absolute row offsets line up.
    ordered = sorted(external_links, key=lambda entry: entry.get("chunk_index", 0))

    collected: List[List[str]] = []
    offset = 0  # absolute row index at which the current chunk begins

    for link in ordered:
        row_count = link.get("row_count", 0)
        first = offset
        last = offset + row_count

        # Does the window [start_row, start_row + num_rows) overlap this chunk?
        if start_row < last and offset < start_row + num_rows:
            try:
                data = fetch_chunk_data([link], link.get("chunk_index"))
                if data:
                    # Translate the absolute window into chunk-local indices.
                    lo = max(0, start_row - first)
                    hi = min(row_count, start_row + num_rows - first)

                    if lo < len(data):
                        collected.extend(data[lo:hi])

                    # Stop early once the page is full.
                    if len(collected) >= num_rows:
                        return collected[:num_rows]
            except Exception as e:
                logging.error(f"Failed to fetch chunk {link.get('chunk_index')}: {e}")
                # Continue with other chunks

        offset += row_count

        # Past the end of the requested window: nothing further to fetch.
        if offset >= start_row + num_rows:
            break

    return collected

152 

153 

class PaginatedSQLResult:
    """
    Stateful pager over SQL results whose data lives behind external links.

    Tracks a cursor into the overall result set and serves fixed-size
    pages of rows on demand via get_next_page().
    """

    def __init__(
        self,
        columns: List[str],
        external_links: List[Dict[str, Any]],
        total_row_count: int,
        chunks: List[Dict[str, Any]],
    ):
        self.columns = columns
        self.external_links = external_links
        self.total_row_count = total_row_count
        self.chunks = chunks
        # Cursor position; advanced by get_next_page(), rewound by reset().
        self.current_position = 0
        self.page_size = 50

    def get_next_page(self) -> tuple[List[List[str]], bool]:
        """
        Return the next page of results.

        Returns:
            Tuple of (rows, has_more): the page's data rows, plus a flag
            indicating whether further pages remain.
        """
        # Cursor already past the end: nothing left to serve.
        if self.current_position >= self.total_row_count:
            return [], False

        page = get_paginated_rows(
            self.external_links, self.current_position, self.page_size
        )

        self.current_position += len(page)
        return page, self.current_position < self.total_row_count

    def reset(self):
        """Rewind the cursor so iteration restarts from row zero."""
        self.current_position = 0