Coverage for mcp_git_explorer/core.py: 62%

171 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-22 14:59 +0200

1import os 

2import tempfile 

3import shutil 

4from pathlib import Path 

5import textwrap 

6from urllib.parse import urlparse 

7import fnmatch 

8from mcp.server.fastmcp import FastMCP 

9from .settings import GitExplorerSettings 

10 

11class GitExplorer: 

12 """Git Explorer tool for accessing and processing Git repositories.""" 

13 def __init__(self, name="Git Codebase Explorer", settings=None): 

14 """Initialize the Git Explorer with optional custom name and settings.""" 

15 self.mcp = FastMCP( 

16 name, 

17 dependencies=["gitpython", "tiktoken"], 

18 ) 

19 self.settings = settings or GitExplorerSettings() 

20 # Register tools 

21 self.mcp.tool()(self.get_codebase) 

22 self.mcp.tool()(self.estimate_codebase) 

23 self.mcp.tool()(self.check_gitlab_token_status) 

24 

25 def _process_repository(self, repo_url: str, use_token: bool = True): 

26 """ 

27 Clone and process a Git repository, returning repository information. 

28  

29 Args: 

30 repo_url (str): The URL of the Git repository to clone 

31 use_token (bool): Whether to use GitLab token for authentication 

32  

33 Returns: 

34 dict: Repository information including: 

35 - temp_dir: Path to cloned repository 

36 - repo_structure: Text representation of repo structure 

37 - file_count: Number of files in repository 

38 - files_content: Concatenated file contents (if requested) 

39 - token_count: Estimated token count 

40 - error: Error message (if any) 

41 """ 

42 import git 

43 import tiktoken 

44 

45 result = { 

46 "temp_dir": None, 

47 "repo_structure": "", 

48 "file_count": 0, 

49 "files_content": "", 

50 "token_count": 0, 

51 "error": None 

52 } 

53 

54 authenticated_url = repo_url 

55 if use_token and self.settings.gitlab_token: 

56 parsed_url = urlparse(repo_url) 

57 netloc = f"oauth2:{self.settings.gitlab_token}@{parsed_url.netloc}" 

58 authenticated_url = parsed_url._replace(netloc=netloc).geturl() 

59 

60 temp_dir = tempfile.mkdtemp() 

61 result["temp_dir"] = temp_dir 

62 

63 try: 

64 # Clone the repository 

65 git.Repo.clone_from(authenticated_url, temp_dir, depth=1) 

66 

67 # Remove .git directory 

68 git_dir = os.path.join(temp_dir, ".git") 

69 if os.path.exists(git_dir): 

70 shutil.rmtree(git_dir) 

71 

72 # Get ignore patterns 

73 ignore_patterns = [] 

74 gitignore_path = os.path.join(temp_dir, ".gitignore") 

75 if os.path.exists(gitignore_path): 

76 with open(gitignore_path, 'r', errors='replace') as f: 

77 for line in f: 

78 line = line.strip() 

79 if line and not line.startswith('#'): 

80 ignore_patterns.append(line) 

81 

82 repomixignore_path = os.path.join(temp_dir, ".repomixignore") 

83 if os.path.exists(repomixignore_path): 

84 with open(repomixignore_path, 'r', errors='replace') as f: 

85 for line in f: 

86 line = line.strip() 

87 if line and not line.startswith('#'): 

88 ignore_patterns.append(line) 

89 

90 # Generate repository structure 

91 repo_structure = self._generate_repo_structure(temp_dir) 

92 result["repo_structure"] = repo_structure 

93 

94 # Count files and generate content 

95 files = [] 

96 root_path = Path(temp_dir) 

97 for path in sorted(root_path.glob("**/*")): 

98 if path.is_file() and not self._should_ignore_file(path, root_path, ignore_patterns) and not self._is_binary_file(path): 

99 try: 

100 content = path.read_text(errors='replace') 

101 if content and content.strip(): 

102 files.append(path) 

103 except Exception: 

104 pass 

105 

106 result["file_count"] = len(files) 

107 

108 # Generate files content if needed 

109 files_content = self._concatenate_files_from_list(root_path, files) 

110 result["files_content"] = files_content 

111 

112 # Count tokens 

113 enc = tiktoken.get_encoding("o200k_base") 

114 

115 # Create content for token estimation 

116 sample_content = f"{repo_structure}\n\n{files_content}" 

117 tokens = enc.encode(sample_content) 

118 result["token_count"] = len(tokens) 

119 

120 return result 

121 

122 except git.GitCommandError as e: 

123 if "Authentication failed" in str(e): 

124 result["error"] = ( 

125 f"Authentication error while accessing repository {repo_url}.\n" 

126 "Make sure the repository is public or a valid access token " 

127 "has been set in the GIT_EXPLORER_GITLAB_TOKEN environment variable." 

128 ) 

129 else: 

130 result["error"] = f"Git error: {str(e)}" 

131 return result 

132 except Exception as e: 

133 result["error"] = f"An error occurred: {str(e)}" 

134 return result 

135 

136 async def estimate_codebase(self, repo_url: str, use_token: bool = True) -> str: 

137 """ 

138 Get statistics about a Git repository without downloading all content. 

139  

140 This tool clones a git repository from the provided URL, analyzes its structure, 

141 and returns statistical information useful for LLM processing, including: 

142 - Estimated token count 

143 - Total file count 

144 - Repository structure 

145  

146 Args: 

147 repo_url (str): The URL of the Git repository to clone 

148 use_token (bool, optional): Whether to use the GitLab token for authentication. 

149 Defaults to True. 

150  

151 Returns: 

152 str: A formatted text representation of the repository statistics 

153  

154 Raises: 

155 GitCommandError: If there is an error during the git clone operation 

156 Exception: For any other errors that occur during processing 

157 """ 

158 result = None 

159 temp_dir = None 

160 

161 try: 

162 # Process the repository 

163 result = self._process_repository(repo_url, use_token) 

164 temp_dir = result["temp_dir"] 

165 

166 if result["error"]: 

167 return result["error"] 

168 

169 # Format the output 

170 output = textwrap.dedent(f""" 

171 # Git Repository Statistics: {repo_url} 

172  

173 ## Summary: 

174 - Estimated token count (o200k_base encoding): {result["token_count"]:,} 

175 - Total files: {result["file_count"]:,} 

176  

177 ## Repository Structure: 

178 {result["repo_structure"]} 

179 """).strip() 

180 

181 return output 

182 

183 finally: 

184 if temp_dir and os.path.exists(temp_dir): 

185 shutil.rmtree(temp_dir) 

186 

187 async def get_codebase(self, repo_url: str, use_token: bool = True) -> str: 

188 """ 

189 Clone a Git repository and generate a structured text file containing its contents. 

190  

191 This tool clones a git repository from the provided URL, processes its contents, 

192 and returns a single text file containing the repository structure and the content 

193 of all files. Binary files and empty text files are excluded. The tool respects 

194 .gitignore and .repomixignore patterns. The output includes an estimated token count 

195 using the o200k_base encoding. 

196  

197 Args: 

198 repo_url (str): The URL of the Git repository to clone 

199 use_token (bool, optional): Whether to use the GitLab token for authentication. 

200 Defaults to True. 

201  

202 Returns: 

203 str: A formatted text representation of the repository contents, including 

204 file structure, estimated token count, and the content of all text files. 

205  

206 Raises: 

207 GitCommandError: If there is an error during the git clone operation 

208 Exception: For any other errors that occur during processing 

209 """ 

210 result = None 

211 temp_dir = None 

212 

213 try: 

214 # Process the repository 

215 result = self._process_repository(repo_url, use_token) 

216 temp_dir = result["temp_dir"] 

217 

218 if result["error"]: 

219 return result["error"] 

220 

221 # Create preamble with token information 

222 preamble = textwrap.dedent(f""" 

223 # Git Repository: {repo_url} 

224 This file contains the complete content of the git repository cloned from: 

225 {repo_url} 

226 Estimated token count (o200k_base encoding): {result["token_count"]:,} 

227 Total files: {result["file_count"]:,} 

228 Below you'll find the repository structure and the full content of all files. 

229 Each file is preceded by a separator indicating the beginning of the file and 

230 followed by a separator indicating the end of the file, along with the full path to the file. 

231  

232 ## Repository Structure: 

233 {result["repo_structure"]} 

234  

235 ## File Contents: 

236 """).strip() 

237 

238 # Create final content 

239 output = f"{preamble}\n\n{result['files_content']}" 

240 return output 

241 

242 finally: 

243 if temp_dir and os.path.exists(temp_dir): 

244 shutil.rmtree(temp_dir) 

245 

246 def check_gitlab_token_status(self) -> str: 

247 """Check if the GitLab token is configured in the environment. 

248 Returns: 

249 A message indicating whether the GitLab token is configured 

250 """ 

251 if self.settings.gitlab_token: 

252 return "GitLab token is configured." 

253 else: 

254 return ( 

255 "GitLab token is not configured. " 

256 "Set the GIT_EXPLORER_GITLAB_TOKEN environment variable " 

257 "to access private GitLab repositories." 

258 ) 

259 

260 def run(self, transport: str = "stdio") -> None: 

261 """Run the Git Explorer with the specified transport.""" 

262 self.mcp.run(transport=transport) 

263 

264 def _should_ignore_file(self, file_path: Path, root_path: Path, ignore_patterns: list[str]) -> bool: 

265 # Convert to a path relative to the root directory 

266 rel_path = file_path.relative_to(root_path) 

267 rel_path_str = str(rel_path).replace(os.sep, '/') 

268 # Check each pattern 

269 for pattern in ignore_patterns: 

270 # Handle pattern formats 

271 if pattern.startswith('/'): 

272 # Pattern starts with / - only match from root 

273 pattern = pattern[1:] 

274 if fnmatch.fnmatch(rel_path_str, pattern): 

275 return True 

276 elif pattern.endswith('/'): 

277 # Pattern ends with / - match directories 

278 if file_path.is_dir() and fnmatch.fnmatch(rel_path_str, pattern[:-1]): 

279 return True 

280 else: 

281 # Standard pattern - match anywhere in path 

282 if fnmatch.fnmatch(rel_path_str, pattern): 

283 return True 

284 # Also check if any parent directory matches the pattern 

285 parts = rel_path_str.split('/') 

286 for i in range(len(parts)): 

287 partial_path = '/'.join(parts[:i+1]) 

288 if fnmatch.fnmatch(partial_path, pattern): 

289 return True 

290 return False 

291 

292 def _generate_repo_structure(self, repo_path: str) -> str: 

293 result = [] 

294 def _add_directory(directory: Path, prefix: str = ""): 

295 paths = sorted(directory.iterdir(), key=lambda p: (p.is_file(), p.name)) 

296 for i, path in enumerate(paths): 

297 is_last = i == len(paths) - 1 

298 result.append(f"{prefix}{'└── ' if is_last else '├── '}{path.name}") 

299 if path.is_dir(): 

300 _add_directory( 

301 path, 

302 prefix + (' ' if is_last else '│ ') 

303 ) 

304 _add_directory(Path(repo_path)) 

305 return "\n".join(result) 

306 

307 def _is_binary_file(self, file_path: Path) -> bool: 

308 """Check if a file is binary by reading its first few thousand bytes.""" 

309 try: 

310 chunk_size = 8000 # Read first 8K bytes 

311 with open(file_path, 'rb') as f: 

312 chunk = f.read(chunk_size) 

313 # Check for null bytes which usually indicate binary content 

314 if b'\x00' in chunk: 

315 return True 

316 # Check if the file is mostly text by looking at the ratio of printable to non-printable characters 

317 text_characters = bytes(range(32, 127)) + b'\n\r\t\b' 

318 # If more than 30% non-printable characters, it's likely binary 

319 return sum(byte not in text_characters for byte in chunk) / len(chunk) > 0.3 

320 except Exception: 

321 # If we can't read it, assume it's binary to be safe 

322 return True 

323 

324 def _concatenate_files_from_list(self, root_path: Path, files: list[Path]) -> str: 

325 """Concatenate the contents of the given files into a single string.""" 

326 result = [] 

327 for path in files: 

328 rel_path = path.relative_to(root_path) 

329 try: 

330 # Read file content 

331 content = path.read_text(errors='replace') 

332 # Skip empty files or files with only empty lines 

333 if not content or not content.strip(): 

334 continue 

335 # Add non-empty text file to result 

336 result.append(f"=====< BEGIN filename: {rel_path} >=====\n") 

337 result.append(content) 

338 result.append(f"===== <END filename: {rel_path} >=====\n\n") 

339 except Exception as e: 

340 result.append(f"=====< BEGIN filename: {rel_path} >=====\n") 

341 result.append(f"[Error reading file: {str(e)}]") 

342 result.append(f"===== <END filename: {rel_path} >=====\n\n") 

343 return "\n".join(result) 

344 

345 def _concatenate_files(self, repo_path: str, ignore_patterns: list[str]) -> str: 

346 """Legacy method - uses _concatenate_files_from_list internally.""" 

347 result = [] 

348 root_path = Path(repo_path) 

349 # Build a list of all files first, so we can sort them 

350 all_files = [] 

351 for path in sorted(root_path.glob("**/*")): 

352 if path.is_file(): 

353 if not self._should_ignore_file(path, root_path, ignore_patterns) and not self._is_binary_file(path): 

354 all_files.append(path) 

355 

356 return self._concatenate_files_from_list(root_path, all_files)