Coverage for src/chuck_data/databricks/url_utils.py: 0%

40 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-06-05 22:56 -0700

1"""Utilities for handling Databricks workspace URLs.""" 

2 

3from __future__ import annotations 

4 

5from typing import Optional, Tuple 

6from urllib.parse import urlparse 

7 

# Base domain for each supported cloud provider.
# Insertion order is significant: the lookups in this module iterate the
# domains in order and stop at the first hit, so the catch-all
# "databricks.com" entry must stay last.
DATABRICKS_DOMAIN_MAP = {
    "AWS": "cloud.databricks.com",
    "Azure": "azuredatabricks.net",
    "GCP": "gcp.databricks.com",
    "Generic": "databricks.com",
}

# Inverse lookup (base domain -> provider name), in the same order as above.
DATABRICKS_DOMAINS = dict(
    zip(DATABRICKS_DOMAIN_MAP.values(), DATABRICKS_DOMAIN_MAP)
)

# Regular expression describing a plausible workspace identifier.
# It accepts any one of:
#   * a run of ten or more digits,
#   * lowercase alphanumerics with optional interior hyphens,
#   * three or more lowercase alphanumerics.
WORKSPACE_ID_PATTERN = r"^([0-9]{10,}|[a-z0-9][a-z0-9\-]*[a-z0-9]|[a-z0-9]{3,})$"

23 

24 

def normalize_workspace_url(url: str) -> str:
    """Return just the workspace identifier portion of a URL.

    The input may be a full URL, a bare hostname, or an identifier that is
    already normalized.  The hostname is extracted and, if it sits under one
    of the base domains in ``DATABRICKS_DOMAIN_MAP``, that domain and the dot
    in front of it are removed.

    Args:
        url: Workspace URL, hostname, or workspace identifier.

    Returns:
        The hostname with a known Databricks base domain stripped.  An empty
        string is returned when ``url`` is empty, contains no hostname, or is
        exactly one of the base domains.
    """
    if not url:
        return ""

    # Surrounding whitespace is tolerated by validate_workspace_url, so it
    # must not end up inside the returned identifier.
    candidate = url.strip()

    # urlparse only fills in ``hostname`` when the "//" authority marker is
    # present, so give bare hostnames / identifiers a scheme first.
    to_parse = candidate if "://" in candidate else f"https://{candidate}"
    host = urlparse(to_parse).hostname or ""

    for domain in DATABRICKS_DOMAIN_MAP.values():
        if host == domain:
            # Bare base domain: there is no workspace part at all.
            return ""
        # Require a label boundary so that e.g. "mydatabricks.com" is not
        # mistaken for a workspace under "databricks.com".
        if host.endswith(f".{domain}"):
            return host[: -(len(domain) + 1)]

    return host

40 

41 

def validate_workspace_url(url: str) -> Tuple[bool, Optional[str]]:
    """Validate that ``url`` is a plausible Databricks workspace URL.

    Only cheap sanity checks are performed; whether the workspace actually
    exists is left to the API calls that use it.

    Args:
        url: Candidate workspace URL or identifier.

    Returns:
        ``(True, None)`` when the value looks usable, otherwise
        ``(False, message)`` where ``message`` describes the problem.
    """
    if not url:
        return False, "Workspace URL cannot be empty"

    if not isinstance(url, str):
        return False, "Workspace URL must be a string"

    # Leading/trailing whitespace is forgiven; everything below looks at the
    # trimmed value.
    url_clean = url.strip()

    if not 1 <= len(url_clean) <= 200:
        return False, "Workspace URL should be between 1-200 characters"

    # Reject every kind of embedded whitespace (tabs, newlines, ...), not
    # only the plain space character.
    if any(ch.isspace() for ch in url_clean):
        return False, "Workspace URL cannot contain spaces"

    return True, None

62 

63 

def get_full_workspace_url(workspace_id: str, cloud_provider: str = "AWS") -> str:
    """Build the ``https://`` URL of a workspace.

    Args:
        workspace_id: Workspace identifier placed in front of the base domain.
        cloud_provider: Key into ``DATABRICKS_DOMAIN_MAP``.  Unknown values
            fall back to the AWS base domain.

    Returns:
        ``https://<workspace_id>.<base domain>``.
    """
    fallback_domain = DATABRICKS_DOMAIN_MAP["AWS"]
    base_domain = DATABRICKS_DOMAIN_MAP.get(cloud_provider, fallback_domain)
    return f"https://{workspace_id}.{base_domain}"

68 

69 

def detect_cloud_provider(url: str) -> str:
    """Infer the cloud provider from ``url``.

    Each known base domain in ``DATABRICKS_DOMAINS`` is searched for as a
    substring of ``url``, in the mapping's iteration order, and the provider
    of the first one found is returned.

    Args:
        url: Workspace URL or hostname.

    Returns:
        The provider name for the first matching base domain, or ``"AWS"``
        when ``url`` is empty or matches none of them.
    """
    if not url:
        return "AWS"  # Default to AWS if no URL provided

    # Host names are case-insensitive, so compare in lowercase; the known
    # domains are all stored in lowercase.
    lowered = url.lower()
    for domain, provider in DATABRICKS_DOMAINS.items():
        if domain in lowered:
            return provider
    return "AWS"

79 

80 

def format_workspace_url_for_display(
    workspace_id: str, cloud_provider: str = "AWS"
) -> str:
    """Return the user-facing form of a workspace URL.

    This delegates to ``get_full_workspace_url`` with the same arguments, so
    the displayed value is the full workspace URL.

    Args:
        workspace_id: Workspace identifier.
        cloud_provider: Provider name passed through unchanged.

    Returns:
        The string produced by ``get_full_workspace_url``.
    """
    display_url = get_full_workspace_url(workspace_id, cloud_provider)
    return display_url