Coverage for /Users/davegaeddert/Developer/dropseed/plain/plain/plain/utils/text.py: 29%

243 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2024-12-23 11:16 -0600

1import gzip 

2import re 

3import secrets 

4import unicodedata 

5from gzip import GzipFile 

6from gzip import compress as gzip_compress 

7from io import BytesIO 

8 

9from plain.exceptions import SuspiciousFileOperation 

10from plain.utils.functional import SimpleLazyObject, keep_lazy_text, lazy 

11from plain.utils.regex_helper import _lazy_re_compile 

12 

13 

@keep_lazy_text
def capfirst(x):
    """
    Return ``x`` with its first character upper-cased.

    Falsy values are handed back untouched. Any other non-string value is
    converted with ``str()`` before the first character is capitalized; the
    rest of the text is left as it is.
    """
    if not x:
        return x
    text = x if isinstance(x, str) else str(x)
    head, tail = text[0], text[1:]
    return head.upper() + tail

22 

23 

# Regular expressions shared by the helpers in this module.
# re_words / re_chars match either a whole "<...>" tag or, captured in
# group 1, one unit of text: a run without whitespace or angle brackets
# (re_words) or a single character (re_chars).
re_words = _lazy_re_compile(r"<[^>]+?>|([^<>\s]+)", re.S)
re_chars = _lazy_re_compile(r"<[^>]+?>|(.)", re.S)
# Groups: (1) the "/" of a closing tag, (2) the tag name, (3) a trailing
# self-closing "/".
re_tag = _lazy_re_compile(r"<(/)?(\S+?)(?:(\s*/)|\s.*?)?>", re.S)
re_newlines = _lazy_re_compile(r"\r\n|\r")  # Used in normalize_newlines
# Used in camel_case_to_spaces
re_camel_case = _lazy_re_compile(r"(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))")

30 

31 

@keep_lazy_text
def wrap(text, width):
    """
    Word-wrap ``text`` at ``width`` characters, keeping existing line breaks.

    Existing line breaks are expected to be posix newlines. All white space
    is preserved, except that the space on which a line is broken is
    replaced by the inserted newline.

    Words longer than ``width`` are never split, so the result may contain
    lines longer than ``width``. A line's own trailing newline is included
    in the length that is compared against ``width``.
    """

    def _pieces():
        # keepends=True so each line still carries its own line break.
        for line in text.splitlines(True):
            while len(line) > width:
                # Prefer the last space that still fits in the window ...
                cut = line[: width + 1].rfind(" ") + 1
                if cut == 0:
                    # ... otherwise fall back to the first space anywhere.
                    cut = line.find(" ") + 1
                    if cut == 0:
                        # No space at all: emit the over-long line as is.
                        yield line
                        line = ""
                        break
                yield f"{line[: cut - 1]}\n"
                line = line[cut:]
            if line:
                yield line

    return "".join(_pieces())

63 

64 

class Truncator(SimpleLazyObject):
    """
    Lazy wrapper around a text that can shorten it to a number of characters
    or a number of words, optionally treating the text as HTML.
    """

    def __init__(self, text):
        # Conversion to str is deferred until the wrapped value is needed.
        super().__init__(lambda: str(text))

    def add_truncation_text(self, text, truncate=None):
        """
        Combine ``text`` with the truncation marker ``truncate``.

        When ``truncate`` contains ``%(truncated_text)s`` it is used as a
        template around ``text``. Otherwise it is appended, unless ``text``
        already ends with it. The default marker is the text followed by an
        ellipsis character.
        """
        marker = "%(truncated_text)s…" if truncate is None else truncate
        if "%(truncated_text)s" in marker:
            return marker % {"truncated_text": text}
        # No placeholder in the marker: append it, but never twice.
        if text.endswith(marker):
            return text
        return f"{text}{marker}"

    def chars(self, num, truncate=None, html=False):
        """
        Return the text truncated to at most ``num`` characters.

        ``truncate`` is the marker passed on to add_truncation_text(); its
        non-combining characters are counted against ``num``. With ``html``
        true, tags are not counted and open tags are closed again.
        """
        self._setup()
        length = int(num)
        text = unicodedata.normalize("NFC", self._wrapped)

        # Room left for real text once the marker has been accounted for.
        truncate_len = length
        for char in self.add_truncation_text("", truncate):
            if unicodedata.combining(char):
                continue
            truncate_len -= 1
            if truncate_len == 0:
                break

        if html:
            return self._truncate_html(length, truncate, text, truncate_len, False)
        return self._text_chars(length, truncate, text, truncate_len)

    def _text_chars(self, length, truncate, text, truncate_len):
        """Truncate plain ``text`` after a certain number of characters."""
        counted = 0
        cut_at = None
        for index, char in enumerate(text):
            # Combining characters do not add to the visible length.
            if unicodedata.combining(char):
                continue
            counted += 1
            if cut_at is None and counted > truncate_len:
                cut_at = index
            if counted > length:
                # Too long: cut and attach the marker.
                return self.add_truncation_text(text[: cut_at or 0], truncate)

        # Short enough, nothing to truncate.
        return text

    def words(self, num, truncate=None, html=False):
        """
        Return the text truncated after ``num`` words.

        ``truncate`` is the marker passed on to add_truncation_text(). With
        ``html`` true, tags are not counted as words.
        """
        self._setup()
        length = int(num)
        if not html:
            return self._text_words(length, truncate)
        return self._truncate_html(length, truncate, self._wrapped, length, True)

    def _text_words(self, length, truncate):
        """
        Truncate the wrapped text after ``length`` words.

        Words are re-joined with single spaces, so newlines and other runs
        of whitespace are not preserved.
        """
        words = self._wrapped.split()
        if len(words) <= length:
            return " ".join(words)
        return self.add_truncation_text(" ".join(words[:length]), truncate)

    def _truncate_html(self, length, truncate, text, truncate_len, words):
        """
        Truncate HTML ``text`` to ``length`` characters, or to ``length``
        words when ``words`` is true. Tags and comments are not counted.

        Tags still open at the truncation point are closed in the result.
        Newlines in the HTML are preserved.
        """
        if words and length <= 0:
            return ""

        # Elements that never take a closing tag.
        html4_singlets = (
            "br",
            "col",
            "link",
            "base",
            "img",
            "param",
            "area",
            "hr",
            "input",
        )

        regex = re_words if words else re_chars

        pos = 0  # where the next search starts
        end_text_pos = 0  # where the kept text ends
        current_len = 0  # words/chars counted so far
        open_tags = []  # innermost open tag first

        while current_len <= length:
            m = regex.search(text, pos)
            if not m:
                # Reached the end of the text.
                break
            pos = m.end(0)
            if m[1]:
                # A real word/character rather than markup.
                current_len += 1
                if current_len == truncate_len:
                    end_text_pos = pos
                continue
            tag = re_tag.match(m[0])
            if not tag or current_len >= truncate_len:
                # Not a tag, or a tag beyond the truncation point.
                continue
            closing_tag, tagname, self_closing = tag.groups()
            # Element names are compared case-insensitively.
            tagname = tagname.lower()
            if self_closing or tagname in html4_singlets:
                continue
            if closing_tag:
                # SGML: an end tag closes, back to the matching start tag,
                # all unclosed intervening start tags with omitted end tags.
                if tagname in open_tags:
                    open_tags = open_tags[open_tags.index(tagname) + 1 :]
            else:
                open_tags.insert(0, tagname)

        if current_len <= length:
            return text

        closers = "".join(f"</{name}>" for name in open_tags)
        return text[:end_text_pos] + self.add_truncation_text("", truncate) + closers

230 

231 

@keep_lazy_text
def get_valid_filename(name):
    """
    Turn ``name`` into a string usable as a clean file name.

    Leading and trailing whitespace is stripped, remaining spaces become
    underscores, and every character that is not alphanumeric, a dash, an
    underscore or a dot is dropped. Raise SuspiciousFileOperation when the
    result is empty, ``"."`` or ``".."``.
    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    underscored = str(name).strip().replace(" ", "_")
    cleaned = re.sub(r"(?u)[^-\w.]", "", underscored)
    if cleaned in {"", ".", ".."}:
        raise SuspiciousFileOperation(f"Could not derive file name from '{name}'")
    return cleaned

247 

248 

@keep_lazy_text
def get_text_list(list_, last_word="or"):
    """
    Join the items of ``list_`` into readable text: items are separated by
    ", " and the final item is introduced by ``last_word``.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    """
    if not list_:
        return ""
    if len(list_) == 1:
        return str(list_[0])
    leading = ", ".join(str(item) for item in list_[:-1])
    return f"{leading} {last_word!s} {list_[-1]!s}"

273 

274 

@keep_lazy_text
def normalize_newlines(text):
    """Return ``str(text)`` with every CRLF and lone CR replaced by LF."""
    as_text = str(text)
    return re_newlines.sub("\n", as_text)

279 

280 

@keep_lazy_text
def phone2numeric(phone):
    """
    Convert a phone number containing letters to its numeric equivalent.

    The input is lower-cased first; letters a-z are replaced by their keypad
    digit and every other character is kept.
    """
    keypad = {
        "2": "abc",
        "3": "def",
        "4": "ghi",
        "5": "jkl",
        "6": "mno",
        "7": "pqrs",
        "8": "tuv",
        "9": "wxyz",
    }
    char2number = {
        letter: digit for digit, letters in keypad.items() for letter in letters
    }
    return "".join(char2number.get(c, c) for c in phone.lower())

313 

314 

315def _get_random_filename(max_random_bytes): 

316 return b"a" * secrets.randbelow(max_random_bytes) 

317 

318 

def compress_string(s, *, max_random_bytes=None):
    """
    Gzip-compress the bytes ``s`` (level 6, mtime 0).

    When ``max_random_bytes`` is truthy, the FNAME flag is set in the gzip
    header and a NUL-terminated file name from _get_random_filename() is
    inserted right after the 10-byte header, which varies the length of
    the output.
    """
    compressed_data = gzip_compress(s, compresslevel=6, mtime=0)
    if max_random_bytes:
        view = memoryview(compressed_data)
        # Copy the fixed 10-byte gzip header so its flags byte can be edited.
        header = bytearray(view[:10])
        header[3] = gzip.FNAME
        filename = _get_random_filename(max_random_bytes) + b"\x00"
        return b"".join((bytes(header), filename, view[10:]))
    return compressed_data

332 

333 

class StreamingBuffer(BytesIO):
    """BytesIO whose read() drains the buffer."""

    def read(self):
        """Return everything written so far and reset the buffer to empty."""
        pending = self.getvalue()
        # Empty the buffer and rewind so later writes start from position 0.
        self.truncate(0)
        self.seek(0)
        return pending

340 

341 

# Like compress_string, but for iterators of strings.
def compress_sequence(sequence, *, max_random_bytes=None):
    """
    Gzip-compress the items of ``sequence`` (level 6, mtime 0), yielding the
    compressed output piece by piece.

    The gzip header is yielded first, then whatever compressed data is
    available after each item is written, and finally the data produced
    when the gzip stream is closed. When ``max_random_bytes`` is truthy, a
    file name from _get_random_filename() is passed to the gzip stream.
    """
    if max_random_bytes:
        filename = _get_random_filename(max_random_bytes)
    else:
        filename = None

    buf = StreamingBuffer()
    zfile = GzipFile(
        filename=filename, mode="wb", compresslevel=6, fileobj=buf, mtime=0
    )
    with zfile:
        # Output headers...
        yield buf.read()
        for item in sequence:
            zfile.write(item)
            chunk = buf.read()
            if chunk:
                yield chunk
    # Whatever was written to the buffer when the stream closed.
    yield buf.read()

357 

358 

# Matches one token for smart_split(): either text that contains one or more
# double- or single-quoted sections (backslash escapes allowed inside the
# quotes), e.g. some_token="with spaces", or a plain run of non-whitespace.
smart_split_re = _lazy_re_compile(
    r"""
    ((?:
        [^\s'"]*
        (?:
            (?:"(?:[^"\\]|\\.)*" | '(?:[^'\\]|\\.)*')
            [^\s'"]*
        )+
    ) | \S+)
""",
    re.VERBOSE,
)

373 

374 

def smart_split(text):
    r"""
    Yield the pieces of ``str(text)`` separated by whitespace, keeping quoted
    phrases in one piece. Single and double quotes are supported, and quotes
    can be escaped with backslashes. Pieces keep their surrounding quote
    marks and their escapes (the results can then be further processed with
    unescape_string_literal()).

    >>> list(smart_split(r'This is "a person\'s" test.'))
    ['This', 'is', '"a person\\\'s"', 'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    ['Another', "'person\\'s'", 'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    ['A', '"\\"funky\\" style"', 'test.']
    """
    source = str(text)
    for match in smart_split_re.finditer(source):
        yield match.group(0)

392 

393 

@keep_lazy_text
def unescape_string_literal(s):
    r"""
    Convert quoted string literals to unquoted strings with escaped quotes and
    backslashes unquoted::

        >>> unescape_string_literal('"abc"')
        'abc'
        >>> unescape_string_literal("'abc'")
        'abc'
        >>> unescape_string_literal('"a \"bc\""')
        'a "bc"'
        >>> unescape_string_literal("'\'ab\' c'")
        "'ab' c"

    Raise ValueError when ``s`` is empty, does not start and end with the
    same quote character, or consists of a single quote character only.
    """
    # A literal needs an opening AND a closing quote. Without the length
    # check a lone '"' would pass, because its first and last character are
    # the same character.
    if not s or len(s) < 2 or s[0] not in "\"'" or s[-1] != s[0]:
        raise ValueError(f"Not a string literal: {s!r}")
    quote = s[0]
    return s[1:-1].replace(rf"\{quote}", quote).replace(r"\\", "\\")

413 

414 

@keep_lazy_text
def slugify(value, allow_unicode=False):
    """
    Build a slug from ``value``.

    Unless ``allow_unicode`` is true, the text is reduced to ASCII. It is
    lower-cased, characters other than alphanumerics, underscores, hyphens
    and whitespace are removed, runs of whitespace and hyphens collapse to a
    single hyphen, and leading/trailing hyphens and underscores are stripped.
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize("NFKC", text)
    else:
        # Decompose, then drop everything that has no ASCII representation.
        decomposed = unicodedata.normalize("NFKD", text)
        text = decomposed.encode("ascii", "ignore").decode("ascii")
    kept = re.sub(r"[^\w\s-]", "", text.lower())
    dashed = re.sub(r"[-\s]+", "-", kept)
    return dashed.strip("-_")

434 

435 

def camel_case_to_spaces(value):
    """
    Insert a space before each CamelCase word boundary in ``value``, then
    strip surrounding whitespace and lower-case the result.
    """
    spaced = re_camel_case.sub(r" \1", value)
    return spaced.strip().lower()

441 

442 

443def _format_lazy(format_string, *args, **kwargs): 

444 """ 

445 Apply str.format() on 'format_string' where format_string, args, 

446 and/or kwargs might be lazy. 

447 """ 

448 return format_string.format(*args, **kwargs) 

449 

450 

def pluralize(singular, plural, number):
    """Return ``singular`` when ``number`` equals 1, otherwise ``plural``."""
    return singular if number == 1 else plural

456 

457 

def pluralize_lazy(singular, plural, number):
    """
    Lazy counterpart of pluralize().

    When ``number`` is an int, return a lazy proxy that calls
    pluralize(singular, plural, number) on demand. Otherwise return a lazy
    string whose ``%`` and ``format()`` operations choose between singular
    and plural at formatting time. If ``number`` is truthy and a dict (for
    ``%``) or keyword arguments (for ``format()``) are supplied, the count
    is read from them under the key ``number``; otherwise the right-hand
    operand or the first positional argument is used as the count.
    """

    def _lazy_number_unpickle(func, resultclass, number, kwargs):
        return lazy_number(func, resultclass, number=number, **kwargs)

    def lazy_number(func, resultclass, number=None, **kwargs):
        if isinstance(number, int):
            # The count is already known: plain lazy call.
            kwargs["number"] = number
            return lazy(func, resultclass)(**kwargs)

        # Snapshot taken before _translate() starts adding "number".
        original_kwargs = kwargs.copy()

        class NumberAwareString(resultclass):
            def __bool__(self):
                return bool(kwargs["singular"])

            def _get_number_value(self, values):
                # Look up the count under the key named by ``number``.
                try:
                    return values[number]
                except KeyError:
                    raise KeyError(
                        f"Your dictionary lacks key '{number}'. Please provide "
                        "it, because it is required to determine whether "
                        "string is singular or plural."
                    )

            def _translate(self, number_value):
                kwargs["number"] = number_value
                return func(**kwargs)

            def format(self, *args, **fmt_kwargs):
                if fmt_kwargs and number:
                    number_value = self._get_number_value(fmt_kwargs)
                else:
                    number_value = args[0]
                return self._translate(number_value).format(*args, **fmt_kwargs)

            def __mod__(self, rhs):
                if isinstance(rhs, dict) and number:
                    number_value = self._get_number_value(rhs)
                else:
                    number_value = rhs
                translated = self._translate(number_value)
                try:
                    translated %= rhs
                except TypeError:
                    # String doesn't contain a placeholder for the number.
                    pass
                return translated

        proxy = lazy(lambda **kwargs: NumberAwareString(), NumberAwareString)(
            **kwargs
        )
        # Rebuild the proxy from its original arguments when it is reduced.
        proxy.__reduce__ = lambda: (
            _lazy_number_unpickle,
            (func, resultclass, number, original_kwargs),
        )
        return proxy

    return lazy_number(pluralize, str, singular=singular, plural=plural, number=number)

516 

517 

# _format_lazy() wrapped with lazy(), declared as producing ``str``.
format_lazy = lazy(_format_lazy, str)