Source code for urlcheck_smith.core.trust_manager

from pathlib import Path
from urllib.parse import urlparse

from .update_yaml import load_db


[docs] class TrustManager: """A General Purpose URL Auditor for urlcheck-smith integration.""" def __init__(self, override_rules=None, default_tier="TIER_3_GENERAL", db_path=None): self.override_rules = override_rules or [] self.default_tier = default_tier self._db_path = Path(db_path) if db_path is not None else None self._uc_smith_db = load_db(self._db_path) def _reload(self): # Reloading from disk to pick up any changes (e.g. from editor functions) # load_db uses caching/defaults efficiently. self._uc_smith_db = load_db(self._db_path) def _tier_from_category(self, category: str | None) -> str: if category == "government": return "TIER_1_OFFICIAL" if category in {"education", "news", "standards"}: return "TIER_2_RELIABLE" if category == "international": return "TIER_1_OFFICIAL" return "TIER_3_GENERAL"
[docs] def classify_url(self, url: str) -> str: """Classifies a single URL into a trust tier using rules then fallbacks.""" normalized_url = url.lower() if "://" not in normalized_url: normalized_url = f"http://{normalized_url}" parsed = urlparse(normalized_url) hostname = parsed.netloc # Normalize hostname for domain matching domain_only = hostname[4:] if hostname.startswith("www.") else hostname self._reload() # v1.7+ metadata-driven priority metadata = self._uc_smith_db.get("metadata", {}) priority = metadata.get("priority", ["user_defined", "api_audit", "global_rules"]) # Add explicit override as first priority if not specified (legacy behavior) if "override" not in priority: priority = ["override"] + priority for stage in priority: # 1. user_defined if stage == "user_defined": for entry in self._uc_smith_db.get("user_defined", []): entry_name = entry.get("name", "").lower() if entry_name == domain_only or entry_name == hostname or hostname.endswith(f".{entry_name}"): if "trust_tier" in entry: return entry["trust_tier"] return self._tier_from_category(entry.get("category")) # 2. global_rules elif stage == "global_rules": rules = self._uc_smith_db.get("global_rules", []) sorted_rules = sorted(rules, key=lambda x: len(x.get("name", "")), reverse=True) for rule in sorted_rules: name = rule.get("name", "").lower() if not name: continue if hostname == name or domain_only == name or hostname.endswith(f".{name}"): if "trust_tier" in rule: return rule["trust_tier"] return self._tier_from_category(rule.get("category")) # 3. api_audit / discovered_cache elif stage == "api_audit": for entry in self._uc_smith_db.get("discovered_cache", []): if entry.get("name") == domain_only: score = entry.get("credibility_score", 0.5) if score >= 0.8: return "TIER_1_OFFICIAL" if score >= 0.5: return "TIER_2_RELIABLE" return "TIER_3_GENERAL" # 4. explicit override rules elif stage == "override": for rule in self.override_rules: match = False if "domain" in rule: target_domain = rule["domain"].lower() if hostname == target_domain or domain_only == target_domain: match = True elif "suffix" in rule: if hostname.endswith(rule["suffix"].lower()): match = True if match and "trust_tier" in rule: return rule["trust_tier"] if match: return self._tier_from_category(rule.get("category")) return self.default_tier
[docs] def audit_list(self, url_list: list) -> dict: """Processes a list of raw URLs into a categorized report.""" report = {"official": [], "reliable": [], "general": []} for url in url_list: category = self.classify_url(url) if category == "TIER_1_OFFICIAL": report["official"].append(url) elif category == "TIER_2_RELIABLE": report["reliable"].append(url) else: report["general"].append(url) return report