Module src.jsonid.jsonid

jsonid entry-point.

Functions

async def create_manifest(path: str) ‑> list[str]
Expand source code
async def create_manifest(path: str) -> list[str]:
    """Collect the path of every file found beneath ``path``.

    The tree is traversed with ``os.walk``; each file path is written to
    the debug log as it is recorded.

    :param path: directory to walk.
    :return: one ``os.path.join(root, file)`` entry per file, in walk order.
    """
    manifest: list[str] = []
    for directory, _subdirs, filenames in os.walk(path):
        # Build this directory's entries first, then log and record them
        # in the same order the walk produced them.
        found = [os.path.join(directory, name) for name in filenames]
        for entry in found:
            logger.debug(entry)
        manifest.extend(found)
    return manifest

Get a list of paths to process.

def decode(content: str)
Expand source code
def decode(content: str):
    """Attempt to parse ``content`` as JSON.

    :param content: text to hand to ``json.loads``.
    :return: ``(True, data)`` when parsing succeeds, otherwise
        ``(False, None)`` after logging the decode error at debug level.
    """
    try:
        parsed = json.loads(content)
    except json.decoder.JSONDecodeError as err:
        logger.debug("can't process: %s", err)
        return False, None
    else:
        return True, parsed

Decode the given content stream.

def get_date_time() ‑> str
Expand source code
def get_date_time() -> str:
    """Return the current UTC time formatted with ``version.UTC_TIME_FORMAT``."""
    utc_now = datetime.datetime.now(timezone.utc)
    return utc_now.strftime(version.UTC_TIME_FORMAT)

Return a datetime string for now().

async def identify_json(paths: list[str], binary: bool)
Expand source code
async def identify_json(paths: list[str], binary: bool):
    """Identify each object in ``paths`` and print the results to stdout.

    Every path is passed to ``identify_plaintext_bytestream``. Paths it
    reports as invalid are skipped (with a warning when ``binary`` is
    set), as are paths whose data is the empty string. For the rest, the
    identifiers returned by ``registry.matcher`` are printed, each result
    terminated by a ``---`` separator line.

    The version header is printed once, immediately before the first
    result. It used to be tied to the first *path* (``idx == 0``), so it
    was silently dropped whenever the first path was skipped.

    :param paths: file paths to inspect.
    :param binary: when true, log a warning for each non-plaintext object.
    """
    header_printed = False
    for path in paths:
        valid, data = await identify_plaintext_bytestream(path)
        if not valid:
            logger.debug("%s: is not plaintext", path)
            if binary:
                logger.warning("report on binary object...")
            continue
        if data == "":
            # Nothing to match against; move on without output.
            continue
        logger.debug("processing: %s", path)
        if not header_printed:
            # Emit the header ahead of the first real result, wherever in
            # the list that result happens to be.
            print("---")
            print(version_header())
            print("---")
            header_printed = True
        res = registry.matcher(data)
        print(f"file: {path}")
        print("identifiers:")
        for item in res:
            print("  ", item)
        print("---")

Identify objects

def main() ‑> None
Expand source code
def main() -> None:
    """Primary entry point for this script.

    Parses the command-line arguments, sets the log level for this
    module's logger, rejects options that are not implemented yet, and
    then runs ``process_data`` for the supplied ``--path``.

    :raises NotImplementedError: if ``--registry``, ``--pronom``,
        ``--language`` or ``--export`` is given a truthy value.
    """
    parser = argparse.ArgumentParser(
        prog="json-id",
        description="proof-of-concept identifier for JSON objects on disk based on identifying valid objects and their key-values",
        epilog="for more information visit https://github.com/ffdev-info/json-id",
    )
    parser.add_argument(
        "--debug",
        help="use debug logging",
        required=False,
        action="store_true",
    )
    parser.add_argument(
        "--path",
        help="file path to process",
        required=True,
    )
    parser.add_argument(
        "--binary",
        help="report on binary formats as well as plaintext",
        required=False,
        action="store_true",
    )
    parser.add_argument(
        "--registry",
        help="path to a custom registry to load into memory replacing the default",
        required=False,
    )
    # NOTE(review): the three options below have no ``action``, so argparse
    # expects a value after each of them. Confirm whether they were meant
    # to be ``store_true`` flags before they are implemented.
    parser.add_argument(
        "--pronom",
        help="return a PRONOM-centric view of the results",
        required=False,
    )
    parser.add_argument(
        "--export",
        help="export the embedded registry",
        required=False,
    )
    parser.add_argument(
        "--language",
        help="return results in different languages",
        required=False,
    )
    args = parser.parse_args()
    logging.getLogger(__name__).setLevel(logging.DEBUG if args.debug else logging.INFO)
    logger.debug("debug logging is configured")
    # Options that are accepted by the parser but not implemented yet,
    # checked in this order so the first one supplied is the one reported.
    not_implemented = (
        (args.registry, "custom registry is not yet available"),
        (args.pronom, "pronom view is not yet implemented"),
        (args.language, "multiple languages are not yet implemented"),
        (args.export, "registry export is not yet implemented"),
    )
    for supplied, message in not_implemented:
        if supplied:
            raise NotImplementedError(message)
    asyncio.run(
        process_data(
            path=args.path,
            binary=args.binary,
        )
    )

Primary entry point for this script.

async def process_data(path: str, binary: bool)
Expand source code
async def process_data(path: str, binary: bool):
    """Identify the object(s) found at ``path``.

    A missing path is logged as an error and exits with status 1. A
    single file is identified and then the process exits with status 0.
    A directory is expanded with ``create_manifest``; if that yields no
    files the process exits with status 1, otherwise every file is
    identified.

    :param path: file or directory to process.
    :param binary: forwarded to ``identify_json``.
    """
    logger.debug("processing: %s", path)
    path_exists = os.path.exists(path)
    if not path_exists:
        logger.error("path: '%s' does not exist", path)
        sys.exit(1)
    if os.path.isfile(path):
        single_file = [path]
        await identify_json(single_file, binary)
        sys.exit(0)
    manifest = await create_manifest(path)
    if manifest:
        await identify_json(manifest, binary)
        return
    # The directory tree contained no files at all.
    logger.info("no files in directory: %s", path)
    sys.exit(1)

Process all objects at a given path

def version_header() ‑> str
Expand source code
def version_header() -> str:
    """Build the two-line header: jsonid version, then the scan date."""
    header_lines = (
        f"jsonid: {version.get_version()}",
        f"scandate: {get_date_time()}",
    )
    return "\n".join(header_lines).strip()

Output a formatted version header.