Coverage for mcp_git_explorer/core.py: 62%
171 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-22 14:59 +0200
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-22 14:59 +0200
1import os
2import tempfile
3import shutil
4from pathlib import Path
5import textwrap
6from urllib.parse import urlparse
7import fnmatch
8from mcp.server.fastmcp import FastMCP
9from .settings import GitExplorerSettings
class GitExplorer:
    """Git Explorer tool for accessing and processing Git repositories.

    The explorer registers three MCP tools (``get_codebase``,
    ``estimate_codebase`` and ``check_gitlab_token_status``) on a ``FastMCP``
    server. Repositories are shallow-cloned into a temporary directory,
    rendered as a directory tree plus concatenated text files, and the
    temporary directory is removed again by the tool that requested it.
    """

    # File names (relative to the repository root) that contribute ignore patterns.
    _IGNORE_FILE_NAMES = (".gitignore", ".repomixignore")

    def __init__(self, name="Git Codebase Explorer", settings=None):
        """Initialize the Git Explorer with optional custom name and settings."""
        self.mcp = FastMCP(
            name,
            dependencies=["gitpython", "tiktoken"],
        )
        self.settings = settings or GitExplorerSettings()
        # Register tools
        self.mcp.tool()(self.get_codebase)
        self.mcp.tool()(self.estimate_codebase)
        self.mcp.tool()(self.check_gitlab_token_status)

    def _build_authenticated_url(self, repo_url: str, use_token: bool) -> str:
        """Return ``repo_url`` with the GitLab token embedded when applicable.

        The token is only injected into HTTP(S) URLs that do not already carry
        credentials. Other URL forms (scp-style ``git@host:path``, ``ssh://``,
        local paths) cannot carry HTTP basic credentials and are returned
        unchanged.
        """
        token = self.settings.gitlab_token
        if not (use_token and token):
            return repo_url
        parsed_url = urlparse(repo_url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
            return repo_url
        if "@" in parsed_url.netloc:
            # Credentials supplied explicitly by the caller take precedence.
            return repo_url
        netloc = f"oauth2:{token}@{parsed_url.netloc}"
        return parsed_url._replace(netloc=netloc).geturl()

    def _redact_token(self, text: str) -> str:
        """Replace any occurrence of the configured GitLab token in ``text``."""
        token = self.settings.gitlab_token
        return text.replace(token, "***") if token else text

    def _load_ignore_patterns(self, repo_dir: str) -> list[str]:
        """Read ignore patterns from the repository root's ignore files.

        Blank lines and ``#`` comments are skipped. Only the files named in
        ``_IGNORE_FILE_NAMES`` at the repository root are consulted.
        """
        patterns = []
        for file_name in self._IGNORE_FILE_NAMES:
            ignore_path = os.path.join(repo_dir, file_name)
            if not os.path.exists(ignore_path):
                continue
            with open(ignore_path, "r", encoding="utf-8", errors="replace") as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith("#"):
                        patterns.append(line)
        return patterns

    def _iter_candidate_files(self, root_path: Path, ignore_patterns: list[str]):
        """Yield, in sorted order, regular text files that are not ignored.

        Symbolic links are skipped: the repository content is untrusted and a
        link could point at arbitrary files outside the clone.
        """
        for path in sorted(root_path.glob("**/*")):
            if path.is_symlink() or not path.is_file():
                continue
            if self._should_ignore_file(path, root_path, ignore_patterns):
                continue
            if self._is_binary_file(path):
                continue
            yield path

    def _process_repository(self, repo_url: str, use_token: bool = True):
        """
        Clone and process a Git repository, returning repository information.

        Errors are not raised; they are reported through the ``error`` key.
        The caller is responsible for deleting ``temp_dir``.

        Args:
            repo_url (str): The URL of the Git repository to clone
            use_token (bool): Whether to use GitLab token for authentication

        Returns:
            dict: Repository information including:
                - temp_dir: Path to cloned repository
                - repo_structure: Text representation of repo structure
                - file_count: Number of non-empty text files in repository
                - files_content: Concatenated file contents
                - token_count: Estimated token count
                - error: Error message (if any)
        """
        import git
        import tiktoken

        result = {
            "temp_dir": None,
            "repo_structure": "",
            "file_count": 0,
            "files_content": "",
            "token_count": 0,
            "error": None
        }

        authenticated_url = self._build_authenticated_url(repo_url, use_token)

        temp_dir = tempfile.mkdtemp()
        result["temp_dir"] = temp_dir

        try:
            # Clone the repository (shallow: only the latest commit is needed)
            git.Repo.clone_from(authenticated_url, temp_dir, depth=1)

            # Remove .git directory so it is neither listed nor concatenated
            git_dir = os.path.join(temp_dir, ".git")
            if os.path.exists(git_dir):
                shutil.rmtree(git_dir)

            ignore_patterns = self._load_ignore_patterns(temp_dir)

            # Generate repository structure
            repo_structure = self._generate_repo_structure(temp_dir)
            result["repo_structure"] = repo_structure

            # Keep only files that actually contain text
            files = []
            root_path = Path(temp_dir)
            for path in self._iter_candidate_files(root_path, ignore_patterns):
                try:
                    content = path.read_text(encoding="utf-8", errors="replace")
                except (OSError, ValueError):
                    # Unreadable file: leave it out rather than fail the run.
                    continue
                if content.strip():
                    files.append(path)

            result["file_count"] = len(files)

            files_content = self._concatenate_files_from_list(root_path, files)
            result["files_content"] = files_content

            # Count tokens
            enc = tiktoken.get_encoding("o200k_base")
            sample_content = f"{repo_structure}\n\n{files_content}"
            # Repository text may literally contain special-token strings such
            # as "<|endoftext|>"; encode them as ordinary text instead of
            # letting tiktoken raise.
            tokens = enc.encode(sample_content, disallowed_special=())
            result["token_count"] = len(tokens)

            return result

        except git.GitCommandError as e:
            if "Authentication failed" in str(e):
                result["error"] = (
                    f"Authentication error while accessing repository {repo_url}.\n"
                    "Make sure the repository is public or a valid access token "
                    "has been set in the GIT_EXPLORER_GITLAB_TOKEN environment variable."
                )
            else:
                # The git command line (and therefore the token-bearing URL)
                # is part of the exception text, so redact before returning.
                result["error"] = f"Git error: {self._redact_token(str(e))}"
            return result
        except Exception as e:
            result["error"] = f"An error occurred: {self._redact_token(str(e))}"
            return result

    async def estimate_codebase(self, repo_url: str, use_token: bool = True) -> str:
        """
        Get statistics about a Git repository.

        This tool clones a git repository from the provided URL, analyzes its structure,
        and returns statistical information useful for LLM processing, including:
        - Estimated token count
        - Total file count
        - Repository structure

        Args:
            repo_url (str): The URL of the Git repository to clone
            use_token (bool, optional): Whether to use the GitLab token for authentication.
                Defaults to True.

        Returns:
            str: A formatted text representation of the repository statistics,
                or an error message if the repository could not be processed.
        """
        temp_dir = None

        try:
            # Process the repository
            result = self._process_repository(repo_url, use_token)
            temp_dir = result["temp_dir"]

            if result["error"]:
                return result["error"]

            # Build the report line by line. Dedenting a template after a
            # multi-line tree has been interpolated does not work, because the
            # tree lines start at column 0 and cancel the common margin.
            token_count = result["token_count"]
            file_count = result["file_count"]
            report_lines = [
                f"# Git Repository Statistics: {repo_url}",
                "",
                "## Summary:",
                f"- Estimated token count (o200k_base encoding): {token_count:,}",
                f"- Total files: {file_count:,}",
                "",
                "## Repository Structure:",
                result["repo_structure"],
            ]
            return "\n".join(report_lines).strip()

        finally:
            if temp_dir:
                # Best-effort cleanup; must not mask the tool's result.
                shutil.rmtree(temp_dir, ignore_errors=True)

    async def get_codebase(self, repo_url: str, use_token: bool = True) -> str:
        """
        Clone a Git repository and generate a structured text file containing its contents.

        This tool clones a git repository from the provided URL, processes its contents,
        and returns a single text file containing the repository structure and the content
        of all files. Binary files and empty text files are excluded. The tool respects
        .gitignore and .repomixignore patterns. The output includes an estimated token count
        using the o200k_base encoding.

        Args:
            repo_url (str): The URL of the Git repository to clone
            use_token (bool, optional): Whether to use the GitLab token for authentication.
                Defaults to True.

        Returns:
            str: A formatted text representation of the repository contents, including
                file structure, estimated token count, and the content of all text files,
                or an error message if the repository could not be processed.
        """
        temp_dir = None

        try:
            # Process the repository
            result = self._process_repository(repo_url, use_token)
            temp_dir = result["temp_dir"]

            if result["error"]:
                return result["error"]

            # Create preamble with token information (built from explicit
            # lines; see estimate_codebase for why dedent is not used).
            token_count = result["token_count"]
            file_count = result["file_count"]
            preamble_lines = [
                f"# Git Repository: {repo_url}",
                "This file contains the complete content of the git repository cloned from:",
                f"{repo_url}",
                f"Estimated token count (o200k_base encoding): {token_count:,}",
                f"Total files: {file_count:,}",
                "Below you'll find the repository structure and the full content of all files.",
                "Each file is preceded by a separator indicating the beginning of the file and",
                "followed by a separator indicating the end of the file, along with the full path to the file.",
                "",
                "## Repository Structure:",
                result["repo_structure"],
                "",
                "## File Contents:",
            ]
            preamble = "\n".join(preamble_lines).strip()

            # Create final content
            return f"{preamble}\n\n{result['files_content']}"

        finally:
            if temp_dir:
                # Best-effort cleanup; must not mask the tool's result.
                shutil.rmtree(temp_dir, ignore_errors=True)

    def check_gitlab_token_status(self) -> str:
        """Check if the GitLab token is configured in the environment.

        Returns:
            A message indicating whether the GitLab token is configured
        """
        if self.settings.gitlab_token:
            return "GitLab token is configured."
        return (
            "GitLab token is not configured. "
            "Set the GIT_EXPLORER_GITLAB_TOKEN environment variable "
            "to access private GitLab repositories."
        )

    def run(self, transport: str = "stdio") -> None:
        """Run the Git Explorer with the specified transport."""
        self.mcp.run(transport=transport)

    def _should_ignore_file(self, file_path: Path, root_path: Path, ignore_patterns: list[str]) -> bool:
        """Return True if ``file_path`` is excluded by any ignore pattern.

        A simplified subset of gitignore matching built on ``fnmatch``:

        - a leading ``/`` is stripped and the pattern is matched against
          root-relative paths only;
        - a trailing ``/`` restricts the pattern to directories, so it
          excludes every file below a matching directory;
        - a pattern without ``/`` also matches any single path component,
          at any depth;
        - every ancestor directory of the file is tested, so ignoring a
          directory ignores its whole subtree.

        Negation (``!pattern``) is not supported. ``fnmatch``'s ``*`` also
        matches ``/``.

        Raises:
            ValueError: If ``file_path`` is not located under ``root_path``.
        """
        rel_path_str = file_path.relative_to(root_path).as_posix()
        parts = rel_path_str.split("/")
        # Root-relative paths of every ancestor directory: "a", "a/b", ...
        dir_prefixes = ["/".join(parts[:i]) for i in range(1, len(parts))]
        target_is_dir = file_path.is_dir()

        for raw_pattern in ignore_patterns:
            pattern = raw_pattern
            anchored = pattern.startswith("/")
            if anchored:
                pattern = pattern[1:]
            dir_only = pattern.endswith("/")
            if dir_only:
                pattern = pattern.rstrip("/")
            if not pattern:
                continue

            # Directory-only patterns may match the ancestors, and the target
            # itself only when it is a directory.
            if dir_only and not target_is_dir:
                path_candidates = dir_prefixes
                name_candidates = parts[:-1]
            else:
                path_candidates = dir_prefixes + [rel_path_str]
                name_candidates = parts

            if any(fnmatch.fnmatch(candidate, pattern) for candidate in path_candidates):
                return True

            # Slash-less, unanchored patterns match a component at any depth.
            if not anchored and "/" not in pattern:
                if any(fnmatch.fnmatch(name, pattern) for name in name_candidates):
                    return True

        return False

    def _generate_repo_structure(self, repo_path: str) -> str:
        """Render the directory tree under ``repo_path`` as box-drawing text.

        Directories are listed before files, each group sorted by name.
        Symbolic links are listed but never descended into, which prevents
        infinite recursion on link cycles.
        """
        lines = []

        def _add_directory(directory: Path, prefix: str = "") -> None:
            entries = sorted(directory.iterdir(), key=lambda p: (p.is_file(), p.name))
            last_index = len(entries) - 1
            for index, entry in enumerate(entries):
                is_last = index == last_index
                lines.append(f"{prefix}{'└── ' if is_last else '├── '}{entry.name}")
                if entry.is_dir() and not entry.is_symlink():
                    # Child prefix is as wide as the 4-character connector so
                    # nested levels line up.
                    _add_directory(entry, prefix + ("    " if is_last else "│   "))

        _add_directory(Path(repo_path))
        return "\n".join(lines)

    def _is_binary_file(self, file_path: Path) -> bool:
        """Check if a file is binary by inspecting its first 8000 bytes.

        A file is considered binary when the sample contains a NUL byte or
        when more than 30% of the sampled bytes are not text. Bytes >= 0x80
        count as text only if the sample is valid UTF-8, so non-Latin text is
        not mistaken for binary data. Empty files are reported as not binary;
        unreadable files are reported as binary to be safe.
        """
        import codecs

        chunk_size = 8000  # Read first 8K bytes
        try:
            with open(file_path, "rb") as f:
                chunk = f.read(chunk_size)
        except OSError:
            # If we can't read it, assume it's binary to be safe
            return True

        if not chunk:
            return False
        # Null bytes usually indicate binary content
        if b"\x00" in chunk:
            return True

        text_bytes = bytes(range(32, 127)) + b"\n\r\t\b"
        try:
            # final=False tolerates a multi-byte character cut by the sample.
            codecs.getincrementaldecoder("utf-8")().decode(chunk, final=False)
        except UnicodeDecodeError:
            pass
        else:
            text_bytes += bytes(range(0x80, 0x100))

        # translate(None, delete) removes all text bytes; what is left is
        # the non-text portion of the sample.
        non_text_count = len(chunk.translate(None, text_bytes))
        return non_text_count / len(chunk) > 0.3

    def _concatenate_files_from_list(self, root_path: Path, files: list[Path]) -> str:
        """Concatenate the contents of the given files into a single string.

        Each non-empty file is wrapped in BEGIN/END separator lines carrying
        its path relative to ``root_path``. Files that cannot be read get an
        error placeholder instead of their content.
        """
        pieces = []
        for path in files:
            rel_path = path.relative_to(root_path)
            try:
                content = path.read_text(encoding="utf-8", errors="replace")
            except (OSError, ValueError) as e:
                pieces.append(f"=====< BEGIN filename: {rel_path} >=====\n")
                pieces.append(f"[Error reading file: {str(e)}]")
                pieces.append(f"===== <END filename: {rel_path} >=====\n\n")
                continue
            # Skip empty files or files with only whitespace
            if not content.strip():
                continue
            pieces.append(f"=====< BEGIN filename: {rel_path} >=====\n")
            pieces.append(content)
            pieces.append(f"===== <END filename: {rel_path} >=====\n\n")
        return "\n".join(pieces)

    def _concatenate_files(self, repo_path: str, ignore_patterns: list[str]) -> str:
        """Legacy method - uses _concatenate_files_from_list internally."""
        root_path = Path(repo_path)
        all_files = list(self._iter_candidate_files(root_path, ignore_patterns))
        return self._concatenate_files_from_list(root_path, all_files)