Coverage for src/mcp_atlassian/preprocessing.py: 87% of 83 statements
coverage.py v7.6.12, created at 2025-02-22 16:34 +0900
import logging
import re
import warnings

from bs4 import BeautifulSoup
from markdownify import markdownify as md

logger = logging.getLogger("mcp-atlassian")


class TextPreprocessor:
    """Handles text preprocessing for Confluence and Jira content."""

    def __init__(self, base_url: str, confluence_client=None):
        self.base_url = base_url.rstrip("/")
        self.confluence_client = confluence_client

    def process_html_content(self, html_content: str, space_key: str = "") -> tuple[str, str]:
        """Process HTML content to replace user refs and page links."""
        try:
            soup = BeautifulSoup(html_content, "html.parser")

            # Process user mentions
            user_mentions = soup.find_all("ri:user")
            for user in user_mentions:
                account_id = user.get("ri:account-id")
                if account_id and self.confluence_client:
                    try:
                        # Fetch user info using the Confluence API
                        user_info = self.confluence_client.get_user_details_by_accountid(account_id)
                        display_name = user_info.get("displayName", account_id)

                        # Replace the entire ac:link structure with @mention
                        link_tag = user.find_parent("ac:link")
                        if link_tag:
                            link_tag.replace_with(f"@{display_name}")
                    except Exception as e:
                        logger.warning(f"Could not fetch user info for {account_id}: {e}")
                        # Fallback: just use the account ID
                        link_tag = user.find_parent("ac:link")
                        if link_tag:
                            link_tag.replace_with(f"@user_{account_id}")

            processed_html = str(soup)
            processed_markdown = md(processed_html)

            return processed_html, processed_markdown

        except Exception as e:
            logger.error(f"Error in process_html_content: {e}")
            raise
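
    # Illustrative note (not in the original source): Confluence storage format
    # represents a user mention roughly as
    #     <ac:link><ri:user ri:account-id="5b10ac8d82e05b22cc7d4ef5"/></ac:link>
    # so, assuming the client lookup returns {"displayName": "Jane Doe"},
    # process_html_content rewrites that fragment to "@Jane Doe" in both the
    # returned HTML and markdown.
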
    def clean_jira_text(self, text: str) -> str:
        """
        Clean Jira text content by:
        1. Processing user mentions and links
        2. Converting HTML/wiki markup to markdown
        """
        if not text:
            return ""

        # Process user mentions
        mention_pattern = r"\[~accountid:(.*?)\]"
        text = self._process_mentions(text, mention_pattern)

        # Process Jira smart links
        text = self._process_smart_links(text)

        # Convert HTML to markdown if needed
        text = self._convert_html_to_markdown(text)

        return text.strip()
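
    # Illustrative example (the input is hypothetical): given the placeholder
    # lookup in _process_mentions below,
    #     clean_jira_text("Ping [~accountid:abc123]")
    # returns "Ping User:abc123".
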
    def _process_mentions(self, text: str, pattern: str) -> str:
        """Process user mentions in text."""
        mentions = re.findall(pattern, text)
        for account_id in mentions:
            try:
                # Note: This is a placeholder - actual user fetching should be injected
                display_name = f"User:{account_id}"
                text = text.replace(f"[~accountid:{account_id}]", display_name)
            except Exception as e:
                logger.error(f"Error getting user info for {account_id}: {e}")
        return text

    def _process_smart_links(self, text: str) -> str:
        """Process Jira/Confluence smart links."""
        # Pattern matches: [text|url|smart-link]
        link_pattern = r"\[(.*?)\|(.*?)\|smart-link\]"
        matches = re.finditer(link_pattern, text)

        for match in matches:
            full_match = match.group(0)
            link_text = match.group(1)
            link_url = match.group(2)

            # Extract issue key if it's a Jira issue link
            issue_key_match = re.search(r"browse/([A-Z]+-\d+)", link_url)
            # Check if it's a Confluence wiki link
            confluence_match = re.search(r"wiki/spaces/.+?/pages/\d+/(.+?)(?:\?|$)", link_url)

            if issue_key_match:
                issue_key = issue_key_match.group(1)
                clean_url = f"{self.base_url}/browse/{issue_key}"
                text = text.replace(full_match, f"[{issue_key}]({clean_url})")
            elif confluence_match:
                url_title = confluence_match.group(1)
                readable_title = url_title.replace("+", " ")
                readable_title = re.sub(r"^[A-Z]+-\d+\s+", "", readable_title)
                text = text.replace(full_match, f"[{readable_title}]({link_url})")
            else:
                clean_url = link_url.split("?")[0]
                text = text.replace(full_match, f"[{link_text}]({clean_url})")

        return text
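
    # Worked examples for the three branches above (URLs are hypothetical,
    # assuming base_url="https://x.atlassian.net"):
    #     [PROJ-1|https://x.atlassian.net/browse/PROJ-1?src=mail|smart-link]
    #         -> [PROJ-1](https://x.atlassian.net/browse/PROJ-1)
    #     [Page|https://x.atlassian.net/wiki/spaces/SP/pages/42/My+Page?focus=1|smart-link]
    #         -> [My Page](https://x.atlassian.net/wiki/spaces/SP/pages/42/My+Page?focus=1)
    #     [Other|https://example.com/thing?utm=1|smart-link]
    #         -> [Other](https://example.com/thing)
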
    def _convert_html_to_markdown(self, text: str) -> str:
        """Convert HTML content to markdown if needed."""
        if re.search(r"<[^>]+>", text):
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    soup = BeautifulSoup(f"<div>{text}</div>", "html.parser")
                    html = str(soup.div.decode_contents()) if soup.div else text
                    text = md(html)
            except Exception as e:
                logger.warning(f"Error converting HTML to markdown: {e}")
        return text
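

# Minimal usage sketch (not part of the original module; the base URL is a
# placeholder and no Confluence client is wired in, so mention lookups use the
# placeholder behaviour above).
if __name__ == "__main__":
    preprocessor = TextPreprocessor(base_url="https://example.atlassian.net/")

    # Smart link: no HTML tags present, so the markdown conversion step is a no-op.
    print(preprocessor.clean_jira_text(
        "See [FOO-1|https://example.atlassian.net/browse/FOO-1|smart-link]"
    ))
    # -> See [FOO-1](https://example.atlassian.net/browse/FOO-1)

    # Inline HTML is converted to markdown via markdownify.
    print(preprocessor.clean_jira_text("Status: <b>done</b>"))
    # -> Status: **done**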