Coverage for /Users/davegaeddert/Developer/dropseed/plain/plain/plain/utils/text.py: 27%
243 statements
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-23 11:16 -0600
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-23 11:16 -0600
1import gzip
2import re
3import secrets
4import unicodedata
5from gzip import GzipFile
6from gzip import compress as gzip_compress
7from io import BytesIO
9from plain.exceptions import SuspiciousFileOperation
10from plain.utils.functional import SimpleLazyObject, keep_lazy_text, lazy
11from plain.utils.regex_helper import _lazy_re_compile
@keep_lazy_text
def capfirst(x):
    """Return ``x`` with its first character upper-cased.

    Falsy input is handed back untouched; any other non-string value is
    converted with ``str()`` before being capitalized.
    """
    if not x:
        return x
    text = x if isinstance(x, str) else str(x)
    head, tail = text[0], text[1:]
    return head.upper() + tail
# Set up regular expressions
# Either an HTML-like tag (no capture) or, in group 1, a run of characters
# that contains no whitespace and no angle brackets (a "word").
re_words = _lazy_re_compile(r"<[^>]+?>|([^<>\s]+)", re.S)
# Either an HTML-like tag (no capture) or, in group 1, any single character.
re_chars = _lazy_re_compile(r"<[^>]+?>|(.)", re.S)
# Dissects one tag into three groups: optional leading "/" (closing tag),
# the tag name, and an optional trailing "/" (self-closing tag).
re_tag = _lazy_re_compile(r"<(/)?(\S+?)(?:(\s*/)|\s.*?)?>", re.S)
# CRLF or a lone CR.
re_newlines = _lazy_re_compile(r"\r\n|\r")  # Used in normalize_newlines
# An uppercase letter preceded by a lowercase letter, or an uppercase letter
# that is not followed by another uppercase letter or the end of the string.
re_camel_case = _lazy_re_compile(r"(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))")
@keep_lazy_text
def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks. Expects that
    existing line breaks are posix newlines.

    Preserve all white space except added line breaks consume the space on
    which they break the line.

    Don't wrap long words, thus the output text may have lines longer than
    ``width``.

    Note that a line's trailing newline counts toward its length, so a line
    of exactly ``width`` visible characters followed by ``"\\n"`` is still
    considered too long and is wrapped at its last space (if it has one).
    """

    def _generator():
        for line in text.splitlines(True):  # True keeps trailing linebreaks
            # The limit is always ``width``. The previous expression,
            # ``min((line.endswith("\n") and width + 1 or width), width)``,
            # could never evaluate to anything else, so it is spelled
            # directly here instead of being recomputed on every pass.
            while len(line) > width:
                # Prefer the last space that still fits within the limit...
                space = line[: width + 1].rfind(" ") + 1
                if space == 0:
                    # ...otherwise break at the first space after it.
                    space = line.find(" ") + 1
                    if space == 0:
                        # No space at all: emit the over-long word as is.
                        yield line
                        line = ""
                        break
                # The space being broken on is replaced by the newline.
                yield f"{line[: space - 1]}\n"
                line = line[space:]
            if line:
                yield line

    return "".join(_generator())
class Truncator(SimpleLazyObject):
    """
    An object used to truncate text, either by characters or words.
    """

    def __init__(self, text):
        """
        Hand the lazy base class a factory that produces ``str(text)``.

        The conversion is therefore not performed here; presumably it runs
        when ``_setup()`` is called (as ``chars()`` and ``words()`` do) --
        confirm against SimpleLazyObject.
        """
        super().__init__(lambda: str(text))

    def add_truncation_text(self, text, truncate=None):
        """
        Combine ``text`` with the truncation marker ``truncate``.

        When ``truncate`` is None it defaults to ``"%(truncated_text)s…"``.
        If the marker contains the ``%(truncated_text)s`` placeholder, ``text``
        is interpolated into it; otherwise the marker is appended to ``text``
        unless ``text`` already ends with it.
        """
        if truncate is None:
            truncate = "%(truncated_text)s…"
        if "%(truncated_text)s" in truncate:
            return truncate % {"truncated_text": text}
        # The truncation text didn't contain the %(truncated_text)s string
        # replacement argument so just append it to the text.
        if text.endswith(truncate):
            # But don't append the truncation text if the current text already
            # ends in this.
            return text
        return f"{text}{truncate}"

    def chars(self, num, truncate=None, html=False):
        """
        Return the text truncated to be no longer than the specified number
        of characters.

        `truncate` specifies what should be used to notify that the string has
        been truncated, defaulting to an ellipsis appended to the text.

        When `html` is true the text is truncated by `_truncate_html()`,
        otherwise by `_text_chars()`.
        """
        self._setup()
        length = int(num)
        text = unicodedata.normalize("NFC", self._wrapped)

        # Calculate the length to truncate to (max length - end_text length)
        truncate_len = length
        for char in self.add_truncation_text("", truncate):
            # Combining characters do not use up any of the length budget.
            if not unicodedata.combining(char):
                truncate_len -= 1
                if truncate_len == 0:
                    # The marker alone already fills the budget; stop counting.
                    break
        if html:
            return self._truncate_html(length, truncate, text, truncate_len, False)
        return self._text_chars(length, truncate, text, truncate_len)

    def _text_chars(self, length, truncate, text, truncate_len):
        """
        Truncate a string after a certain number of chars.

        `length` is the maximum number of non-combining characters allowed in
        the result; `truncate_len` is how many of them may come from `text`
        itself once room for the truncation marker has been reserved.
        """
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
            if unicodedata.combining(char):
                # Don't consider combining characters
                # as adding to the string length
                continue
            s_len += 1
            # Remember where to cut the first time the reserved budget is
            # exceeded; whether a cut is needed at all is decided below.
            if end_index is None and s_len > truncate_len:
                end_index = i
            if s_len > length:
                # Return the truncated string
                return self.add_truncation_text(text[: end_index or 0], truncate)

        # Return the original string since no truncation was necessary
        return text

    def words(self, num, truncate=None, html=False):
        """
        Truncate a string after a certain number of words. `truncate` specifies
        what should be used to notify that the string has been truncated,
        defaulting to ellipsis.

        When `html` is true the text is truncated by `_truncate_html()`,
        otherwise by `_text_words()`.
        """
        self._setup()
        length = int(num)
        if html:
            return self._truncate_html(length, truncate, self._wrapped, length, True)
        return self._text_words(length, truncate)

    def _text_words(self, length, truncate):
        """
        Truncate a string after a certain number of words.

        Strip newlines in the string: the text is split on any whitespace and
        the words are re-joined with single spaces, whether or not truncation
        takes place.
        """
        words = self._wrapped.split()
        if len(words) > length:
            words = words[:length]
            return self.add_truncation_text(" ".join(words), truncate)
        return " ".join(words)

    def _truncate_html(self, length, truncate, text, truncate_len, words):
        """
        Truncate HTML to a certain number of chars (not counting tags and
        comments), or, if words is True, then to a certain number of words.
        Close opened tags if they were correctly closed in the given HTML.

        Preserve newlines in the HTML.

        `length` is the maximum number of units (chars or words) allowed and
        `truncate_len` is the number of units after which the text is cut.
        Return `text` unchanged when it holds no more than `length` units.
        """
        if words and length <= 0:
            return ""

        # Tag names treated as never needing a closing tag.
        html4_singlets = (
            "br",
            "col",
            "link",
            "base",
            "img",
            "param",
            "area",
            "hr",
            "input",
        )

        # Count non-HTML chars/words and keep note of open tags
        pos = 0
        end_text_pos = 0
        current_len = 0
        open_tags = []

        regex = re_words if words else re_chars

        # Scan until one unit more than `length` has been seen (which proves
        # truncation is required) or the text is exhausted.
        while current_len <= length:
            m = regex.search(text, pos)
            if not m:
                # Checked through whole string
                break
            pos = m.end(0)
            if m[1]:
                # It's an actual non-HTML word or char
                current_len += 1
                if current_len == truncate_len:
                    # Position just after the last unit that will be kept.
                    end_text_pos = pos
                continue
            # Check for tag
            tag = re_tag.match(m[0])
            if not tag or current_len >= truncate_len:
                # Don't worry about non tags or tags after our truncate point
                continue
            closing_tag, tagname, self_closing = tag.groups()
            # Element names are always case-insensitive
            tagname = tagname.lower()
            if self_closing or tagname in html4_singlets:
                pass
            elif closing_tag:
                # Check for match in open tags list
                try:
                    i = open_tags.index(tagname)
                except ValueError:
                    pass
                else:
                    # SGML: An end tag closes, back to the matching start tag,
                    # all unclosed intervening start tags with omitted end tags
                    open_tags = open_tags[i + 1 :]
            else:
                # Add it to the start of the open tags list
                open_tags.insert(0, tagname)

        if current_len <= length:
            return text
        out = text[:end_text_pos]
        truncate_text = self.add_truncation_text("", truncate)
        if truncate_text:
            out += truncate_text
        # Close any tags still open
        for tag in open_tags:
            out += f"</{tag}>"
        # Return string
        return out
@keep_lazy_text
def get_valid_filename(name):
    """
    Turn ``name`` into a string that is safe to use as a plain file name.

    Surrounding whitespace is stripped, remaining spaces become underscores,
    and every character that is not alphanumeric, a dash, an underscore, or a
    dot is dropped. Raise SuspiciousFileOperation when nothing usable is left
    (an empty string, ``"."`` or ``".."``).
    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    trimmed = str(name).strip()
    underscored = trimmed.replace(" ", "_")
    candidate = re.sub(r"(?u)[^-\w.]", "", underscored)
    if candidate not in {"", ".", ".."}:
        return candidate
    raise SuspiciousFileOperation(f"Could not derive file name from '{name}'")
@keep_lazy_text
def get_text_list(list_, last_word="or"):
    """
    Join the items of ``list_`` into a human-readable enumeration.

    All items but the last are separated by ", "; the last one is attached
    with ``last_word``.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    """
    if not list_:
        return ""
    if len(list_) == 1:
        return str(list_[0])
    leading = ", ".join(map(str, list_[:-1]))
    # ``!s`` forces str() on each piece, like the explicit str() calls would.
    return f"{leading} {last_word!s} {list_[-1]!s}"
@keep_lazy_text
def normalize_newlines(text):
    """Return ``str(text)`` with every CRLF or lone CR replaced by LF."""
    as_text = str(text)
    return re_newlines.sub("\n", as_text)
@keep_lazy_text
def phone2numeric(phone):
    """
    Replace the letters of a phone number with their keypad digits.

    The input is lower-cased first; characters other than a-z are kept as
    they are.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    # Keypad layout: abc=2 def=3 ghi=4 jkl=5 mno=6 pqrs=7 tuv=8 wxyz=9
    digits = "22233344455566677778889999"
    keypad = str.maketrans(letters, digits)
    return phone.lower().translate(keypad)
def _get_random_filename(max_random_bytes):
    """
    Return a run of ``b"a"`` bytes whose length is chosen at random.

    The length comes from ``secrets.randbelow(max_random_bytes)``, so it is
    always smaller than ``max_random_bytes``.
    """
    length = secrets.randbelow(max_random_bytes)
    return b"a" * length
def compress_string(s, *, max_random_bytes=None):
    """
    Gzip-compress the bytes ``s`` (level 6, mtime fixed to 0).

    When ``max_random_bytes`` is truthy, the gzip header is rewritten to carry
    a file name field holding a random-length run of filler bytes, which makes
    the length of the output vary between calls.
    """
    payload = gzip_compress(s, compresslevel=6, mtime=0)
    if max_random_bytes:
        view = memoryview(payload)
        # The first 10 bytes are copied so the flags byte can be changed.
        head = bytearray(view[:10])
        head[3] = gzip.FNAME
        # The file name field is terminated by a zero byte.
        padding = _get_random_filename(max_random_bytes) + b"\x00"
        return bytes(head) + padding + view[10:]
    return payload
class StreamingBuffer(BytesIO):
    """A BytesIO whose ``read()`` drains the buffer."""

    def read(self):
        """Return everything written so far and leave the buffer empty."""
        pending = self.getvalue()
        # Rewind first so that truncate() cuts the buffer down to zero bytes.
        self.seek(0)
        self.truncate()
        return pending
# Like compress_string, but for iterators of strings.
def compress_sequence(sequence, *, max_random_bytes=None):
    """
    Gzip-compress the items of ``sequence`` incrementally.

    Yield the gzip header first, then whatever compressed output becomes
    available after each item is written, and finally the data produced when
    the gzip stream is closed. When ``max_random_bytes`` is truthy a
    random-length filler file name is stored in the header.
    """
    stream = StreamingBuffer()
    name = None
    if max_random_bytes:
        name = _get_random_filename(max_random_bytes)
    gz_options = {
        "filename": name,
        "mode": "wb",
        "compresslevel": 6,
        "fileobj": stream,
        "mtime": 0,
    }
    with GzipFile(**gz_options) as gz:
        # Output headers...
        yield stream.read()
        for chunk in sequence:
            gz.write(chunk)
            pending = stream.read()
            if pending:
                yield pending
    # Leaving the ``with`` block closes the gzip stream; emit what it wrote.
    yield stream.read()
# Expression to match some_token and some_token="with spaces" (and similarly
# for single-quoted strings).
#
# First alternative: optional unquoted characters, followed by one or more
# quoted strings (backslash escapes allowed inside the quotes), each of which
# may be followed by more unquoted characters.
# Second alternative: any run of non-whitespace characters.
smart_split_re = _lazy_re_compile(
    r"""
    ((?:
        [^\s'"]*
        (?:
            (?:"(?:[^"\\]|\\.)*" | '(?:[^'\\]|\\.)*')
            [^\s'"]*
        )+
    ) | \S+)
""",
    re.VERBOSE,
)
def smart_split(text):
    r"""
    Yield the whitespace-separated pieces of ``text``, keeping quoted phrases
    in one piece.

    Both single and double quotes are recognized, and quotes may be escaped
    with backslashes. Pieces are yielded exactly as they appear in the input:
    surrounding quote marks stay in place and escaped quotes stay escaped
    (use unescape_string_literal() to process them further).

    >>> list(smart_split(r'This is "a person\'s" test.'))
    ['This', 'is', '"a person\\\'s"', 'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    ['Another', "'person\\'s'", 'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    ['A', '"\\"funky\\" style"', 'test.']
    """
    source = str(text)
    for match in smart_split_re.finditer(source):
        yield match.group(0)
@keep_lazy_text
def unescape_string_literal(s):
    r"""
    Strip the surrounding quotes from a quoted string literal and unescape
    the escaped quotes and backslashes inside it::

        >>> unescape_string_literal('"abc"')
        'abc'
        >>> unescape_string_literal("'abc'")
        'abc'
        >>> unescape_string_literal('"a \"bc\""')
        'a "bc"'
        >>> unescape_string_literal("'\'ab\' c'")
        "'ab' c"

    Raise ValueError when ``s`` is empty or does not start and end with the
    same quote character.
    """
    is_quoted = bool(s) and s[0] in "\"'" and s[-1] == s[0]
    if not is_quoted:
        raise ValueError(f"Not a string literal: {s!r}")
    quote = s[0]
    inner = s[1:-1]
    # Escaped quotes are handled first, escaped backslashes second.
    inner = inner.replace(rf"\{quote}", quote)
    return inner.replace(r"\\", "\\")
@keep_lazy_text
def slugify(value, allow_unicode=False):
    """
    Build a slug from ``value``.

    Unless ``allow_unicode`` is true, the text is reduced to ASCII. The result
    is lower-cased, characters other than alphanumerics, underscores, hyphens
    and whitespace are removed, runs of whitespace and hyphens collapse into a
    single hyphen, and leading/trailing hyphens and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        # Decompose, then drop whatever cannot be represented in ASCII.
        decomposed = unicodedata.normalize("NFKD", text)
        text = decomposed.encode("ascii", "ignore").decode("ascii")
    else:
        text = unicodedata.normalize("NFKC", text)
    cleaned = re.sub(r"[^\w\s-]", "", text.lower())
    hyphenated = re.sub(r"[-\s]+", "-", cleaned)
    return hyphenated.strip("-_")
def camel_case_to_spaces(value):
    """
    Insert a space before each CamelCase word boundary in ``value``, then
    strip surrounding whitespace and lower-case the result.
    """
    spaced = re_camel_case.sub(r" \1", value)
    trimmed = spaced.strip()
    return trimmed.lower()
def _format_lazy(format_string, *args, **kwargs):
    """
    Return ``format_string.format(*args, **kwargs)``.

    Kept as a plain function so it can be wrapped lazily; the format string
    and any of the arguments may themselves be lazy objects.
    """
    rendered = format_string.format(*args, **kwargs)
    return rendered
def pluralize(singular, plural, number):
    """Return ``singular`` when ``number`` equals 1, otherwise ``plural``."""
    return singular if number == 1 else plural
def pluralize_lazy(singular, plural, number):
    """
    Return a lazy string that resolves to ``singular`` or ``plural``.

    When ``number`` is an int the choice is made by ``pluralize()`` through a
    regular lazy proxy. Otherwise ``number`` is treated as the key under
    which the real count will be supplied later, when the result is formatted
    with ``.format()`` or the ``%`` operator.
    """

    # NOTE(review): this helper is referenced from ``proxy.__reduce__`` below,
    # but it is a nested function; pickle generally cannot serialize local
    # functions -- confirm whether pickling these proxies is expected to work.
    def _lazy_number_unpickle(func, resultclass, number, kwargs):
        return lazy_number(func, resultclass, number=number, **kwargs)

    def lazy_number(func, resultclass, number=None, **kwargs):
        if isinstance(number, int):
            # The count is already known: bind it and defer only the call.
            kwargs["number"] = number
            proxy = lazy(func, resultclass)(**kwargs)
        else:
            # Snapshot taken before the methods below start mutating
            # ``kwargs["number"]``; it is what ``__reduce__`` hands back.
            original_kwargs = kwargs.copy()

            class NumberAwareString(resultclass):
                def __bool__(self):
                    # Truthiness follows the singular form.
                    return bool(kwargs["singular"])

                def _get_number_value(self, values):
                    # ``number`` (from the enclosing scope) is the lookup key.
                    try:
                        return values[number]
                    except KeyError:
                        raise KeyError(
                            f"Your dictionary lacks key '{number}'. Please provide "
                            "it, because it is required to determine whether "
                            "string is singular or plural."
                        )

                def _translate(self, number_value):
                    # Writes into the enclosing ``kwargs`` before calling
                    # ``func`` with them.
                    kwargs["number"] = number_value
                    return func(**kwargs)

                def format(self, *args, **kwargs):
                    # Prefer the keyword mapping when one was given and a key
                    # name is set; otherwise the first positional argument is
                    # taken as the count.
                    number_value = (
                        self._get_number_value(kwargs) if kwargs and number else args[0]
                    )
                    return self._translate(number_value).format(*args, **kwargs)

                def __mod__(self, rhs):
                    # A dict operand supplies the count under the key
                    # ``number``; any other operand is itself the count.
                    if isinstance(rhs, dict) and number:
                        number_value = self._get_number_value(rhs)
                    else:
                        number_value = rhs
                    translated = self._translate(number_value)
                    try:
                        translated %= rhs
                    except TypeError:
                        # String doesn't contain a placeholder for the number.
                        pass
                    return translated

            proxy = lazy(lambda **kwargs: NumberAwareString(), NumberAwareString)(
                **kwargs
            )
            proxy.__reduce__ = lambda: (
                _lazy_number_unpickle,
                (func, resultclass, number, original_kwargs),
            )
        return proxy

    return lazy_number(pluralize, str, singular=singular, plural=plural, number=number)
# Lazily evaluated wrapper around _format_lazy; ``str`` is passed to lazy()
# as the result class.
format_lazy = lazy(_format_lazy, str)