Source code for urlcheck_smith.core.extract

# src/urlcheck_smith/core/extract.py
from __future__ import annotations

import csv
import hashlib
import re
from pathlib import Path
from typing import Iterable, List, Set, Iterator
from urllib.parse import urlparse, urlunparse

HTTPS_URL_RE = re.compile(r"https://[^\s,\"'<>]+", re.IGNORECASE)

# logging.basicConfig(
#     level=logging.INFO,
#     format="%(levelname)s: %(message)s",
# )

from urlextract import URLExtract

from ..models import UrlRecord

# Initialize the extractor once. 
# It handles TLD updates and complex character matching internally.
_EXTRACTOR = URLExtract()


[docs] def normalize_url(url: str) -> str: """ Standardize the URL to prevent duplicate checks of the same resource. """ try: # urlextract might return URLs with trailing punctuation if not careful. # We strip common trailing noise before parsing. url = url.rstrip('.,);]') # urlextract might return URLs without schemes (e.g., 'google.com'). # urlparse needs a scheme to identify the netloc correctly. temp_url = url if "://" in url else f"http://{url}" parsed = urlparse(temp_url) netloc = parsed.netloc.lower() if not netloc: return url normalized = urlunparse(( parsed.scheme.lower() or "http", netloc, parsed.path, parsed.params, parsed.query, '' # Dropping fragments )) return normalized.rstrip('/') except Exception: return url
[docs] def extract_urls_from_text(text: str) -> List[UrlRecord]: """ Extract, clean, and deduplicate URLs from a block of text using urlextract. """ # urlextract handles trailing punctuation and balanced brackets automatically. found = _EXTRACTOR.find_urls(text, only_unique=True) seen: Set[str] = set() records: List[UrlRecord] = [] for raw in found: normalized = normalize_url(raw) if normalized not in seen: seen.add(normalized) records.append(UrlRecord(url=normalized)) return records
[docs] def stream_extract_from_file(path: Path) -> Iterator[UrlRecord]: """ Generator that yields URLs line-by-line to handle large files efficiently. """ seen: Set[str] = set() try: with path.open('r', encoding='utf-8', errors='ignore') as f: for line in f: # We use the internal extractor logic per line for record in extract_urls_from_text(line): if record.url not in seen: seen.add(record.url) yield record except (OSError, UnicodeDecodeError): return
[docs] def extract_urls_from_paths(paths: Iterable[Path]) -> List[UrlRecord]: """ Aggregates URLs from multiple paths using the streaming logic. """ all_records: List[UrlRecord] = [] global_seen: Set[str] = set() for p in paths: for record in stream_extract_from_file(p): if record.url not in global_seen: global_seen.add(record.url) all_records.append(record) return all_records # import logging
[docs] def extract_https_urls(path: Path) -> list[str]: """ Extracts unique HTTPS URLs from a file. Insecure HTTP links are ignored. This function reads the content of a file from the provided path, extracts all HTTP and HTTPS URLs using a predefined regular expression, cleans the URLs by removing trailing characters such as '.', ',', ')', ']', '>', or quotation marks, removes duplicates, and returns a sorted list of unique URLs. This script is made CLI-apps-friendly; e.g., a manual input string will be converted to a Path object to avoid raising errors. Args: path (Path): The path to the file to be processed. Returns: list[str]: A sorted list of unique cleaned HTTP and HTTPS URLs extracted from the file. """ # logging.info("Reading file: %s", path) path = Path(path) text = path.read_text(encoding="utf-8", errors="ignore") raw_urls = HTTPS_URL_RE.findall(text) # logging.info("Regex HTTP(S) matches: %d", len(raw_urls)) cleaned = [ url.strip().rstrip('.,);]>"\'') for url in raw_urls ] # logging.info("After cleaning: %d", len(cleaned)) unique_urls = sorted(set(cleaned)) # logging.info("After dedupe: %d", len(unique_urls)) return unique_urls
[docs] def sha256_hex(s: str) -> str: return hashlib.sha256(s.encode("utf-8")).hexdigest()
[docs] def urls_to_csv(urls: list[str], output_path: Path) -> None: """ Save the URL list to a CSV file with columns: - URL - hashed_URL (SHA-256 hex) Args: urls (list[str]): List of URLs output_path (Path): Output CSV file path """ with output_path.open("w", newline="", encoding="utf-8") as f: writer = csv.writer(f) # header writer.writerow(["URL", "hashed_URL"]) # rows for url in urls: writer.writerow([url, sha256_hex(url)])