src.co_tools.co_fasta

 1import os
 2import re
 3from glob import glob
 4from pathlib import Path
 5from pathlib import PurePath
 6
 7if os.getenv("CO_LOG").lower() == "true":
 8    from .get_logger import LOGGER
 9
10    log = LOGGER
11else:
12    import logging
13
14    log = logging.getLogger(__name__)
15
16FASTA_EXTENSIONS = [".fa", ".fna", ".ffn", ".frn", ".fasta", ".faa"]
17ALL_SUFFIXES = FASTA_EXTENSIONS + [".gz", ".bgz"]
18
19
def find_extension(input_file: str) -> str:
    """Return ``input_file`` if its name carries exactly one FASTA extension.

    Only the part of the path from the last ``.f`` onwards is inspected, so a
    dotted prefix such as ``sample.v1`` in ``sample.v1.fasta`` is not counted
    as a suffix. Every remaining suffix must be listed in ``ALL_SUFFIXES`` and
    exactly one of them must be in ``FASTA_EXTENSIONS``.

    Args:
        input_file: Path to check, given as a plain string. ``pathlib``
            objects are rejected with an error log entry.

    Returns:
        ``input_file`` unchanged when it qualifies, otherwise ``""``.
    """
    if isinstance(input_file, PurePath):
        log.error(f"input_path {input_file} must not be a pathlib.PurePath object")
        return ""
    # Swap everything up to the last ".f" for a neutral placeholder stem.
    # The stem must be an ordinary character: a bare leading "." would make
    # pathlib treat the name as a hidden file (losing the first suffix), and a
    # backslash is a path separator on Windows with the same effect.
    input_file_ext = re.sub(r".*\.f", "_.f", input_file)
    suffixes = Path(input_file_ext).suffixes
    log.debug(f"Suffixes: {suffixes}")
    if mismatch_suffix := set(suffixes) - set(ALL_SUFFIXES):
        log.info(f"Suffix {mismatch_suffix} not allowed.")
    else:
        matching_suffix = set(suffixes) & set(FASTA_EXTENSIONS)
        if len(matching_suffix) == 1:
            log.info(f"Matched fasta file {input_file}")
            return input_file
    return ""
35
36
def find_fasta_file(input_path: str) -> str:
    """Search ``input_path`` recursively and return one FASTA file path.

    Candidates are all paths below ``input_path`` that match ``*.f*``; each
    one is checked with ``find_extension``. Candidates are examined in sorted
    order so that the result does not depend on the order in which the file
    system lists entries.

    Args:
        input_path: Directory to search, given as a plain string. ``pathlib``
            objects are rejected with an error log entry.

    Returns:
        The first matching path in sorted order, or ``""`` when nothing
        matches. A warning is logged when more than one file matches.
    """
    if isinstance(input_path, PurePath):
        log.error(f"input_path {input_path} must not be a pathlib.PurePath object")
        return ""
    if input_files := glob(f"{input_path}/**/*.f*", recursive=True):
        log.debug(f"Found possible fasta matches: {input_files}")
    else:
        log.warning(f"No input files found in {input_path}")
        return ""

    matched_files = []

    # glob makes no ordering promise; sorting keeps the "first" match stable.
    for input_file in sorted(input_files):
        log.debug(f"Input file: {input_file}")
        fasta_file = find_extension(input_file)
        if fasta_file:
            matched_files.append(fasta_file)
    if len(matched_files) > 1:
        log.warning(f"More than one fasta file matched! Returning {matched_files[0]}")
        return matched_files[0]
    elif len(matched_files) == 1:
        log.info(f"Matched {matched_files[0]}")
        return matched_files[0]
    else:
        log.warning("Unable to find matching fasta file.")
        return ""
def find_extension(input_file: str) -> str:
    """Return ``input_file`` if its name carries exactly one FASTA extension.

    Only the part of the path from the last ``.f`` onwards is inspected, so a
    dotted prefix such as ``sample.v1`` in ``sample.v1.fasta`` is not counted
    as a suffix. Every remaining suffix must be listed in ``ALL_SUFFIXES`` and
    exactly one of them must be in ``FASTA_EXTENSIONS``.

    Args:
        input_file: Path to check, given as a plain string. ``pathlib``
            objects are rejected with an error log entry.

    Returns:
        ``input_file`` unchanged when it qualifies, otherwise ``""``.
    """
    if isinstance(input_file, PurePath):
        log.error(f"input_path {input_file} must not be a pathlib.PurePath object")
        return ""
    # Swap everything up to the last ".f" for a neutral placeholder stem.
    # The stem must be an ordinary character: a bare leading "." would make
    # pathlib treat the name as a hidden file (losing the first suffix), and a
    # backslash is a path separator on Windows with the same effect.
    input_file_ext = re.sub(r".*\.f", "_.f", input_file)
    suffixes = Path(input_file_ext).suffixes
    log.debug(f"Suffixes: {suffixes}")
    if mismatch_suffix := set(suffixes) - set(ALL_SUFFIXES):
        log.info(f"Suffix {mismatch_suffix} not allowed.")
    else:
        matching_suffix = set(suffixes) & set(FASTA_EXTENSIONS)
        if len(matching_suffix) == 1:
            log.info(f"Matched fasta file {input_file}")
            return input_file
    return ""
def find_fasta_file(input_path: str) -> str:
    """Search ``input_path`` recursively and return one FASTA file path.

    Candidates are all paths below ``input_path`` that match ``*.f*``; each
    one is checked with ``find_extension``. Candidates are examined in sorted
    order so that the result does not depend on the order in which the file
    system lists entries.

    Args:
        input_path: Directory to search, given as a plain string. ``pathlib``
            objects are rejected with an error log entry.

    Returns:
        The first matching path in sorted order, or ``""`` when nothing
        matches. A warning is logged when more than one file matches.
    """
    if isinstance(input_path, PurePath):
        log.error(f"input_path {input_path} must not be a pathlib.PurePath object")
        return ""
    if input_files := glob(f"{input_path}/**/*.f*", recursive=True):
        log.debug(f"Found possible fasta matches: {input_files}")
    else:
        log.warning(f"No input files found in {input_path}")
        return ""

    matched_files = []

    # glob makes no ordering promise; sorting keeps the "first" match stable.
    for input_file in sorted(input_files):
        log.debug(f"Input file: {input_file}")
        fasta_file = find_extension(input_file)
        if fasta_file:
            matched_files.append(fasta_file)
    if len(matched_files) > 1:
        log.warning(f"More than one fasta file matched! Returning {matched_files[0]}")
        return matched_files[0]
    elif len(matched_files) == 1:
        log.info(f"Matched {matched_files[0]}")
        return matched_files[0]
    else:
        log.warning("Unable to find matching fasta file.")
        return ""