"""Compare visible FAQ markup with FAQ JSON-LD for every HTML file in a tree.

For each ``*.html`` file under ``ROOT`` the script counts the FAQ pairs found
in the page markup and the ``Question`` items found in JSON-LD blocks, then
writes a per-file status report to ``REPORT``.
"""
import json
import os
import re

ROOT = r"C:\Users\Bodia\OneDrive\Робочий стіл\myipnow-main"
REPORT = "faq_schema_validation_report.txt"

# NOTE(review): the HTML markup that surrounded the capture group in these two
# patterns is missing from the source this file was restored from; only the
# blank-line delimiters survived, and both patterns are currently identical.
# Replace them with the site's real accordion question/answer markup (or pass
# patterns to extract_faq_content) before trusting the "visible FAQ" counts.
FAQ_QUESTION_PATTERN = r"\n\n(.*?)\n\n"
FAQ_ANSWER_PATTERN = r"\n\n(.*?)\n\n"

# NOTE(review): the original pattern here was empty (it could only ever match
# empty strings, so no schema was ever parsed). Reconstructed to capture the
# body of <script type="application/ld+json"> elements — confirm against the
# site's markup.
_JSON_LD_RE = re.compile(
    r'<script\b[^>]*\btype=["\']application/ld\+json["\'][^>]*>(.*?)</script\s*>',
    re.DOTALL | re.IGNORECASE,
)

_TAG_RE = re.compile(r"<.*?>")

_RULE = "━━━━━━━━━━━━━━━━━━━━━━━━━━"


def _strip_tags(fragment):
    """Return *fragment* with HTML tags removed and outer whitespace trimmed."""
    return _TAG_RE.sub("", fragment).strip()


def _as_list(value):
    """Wrap a JSON value in a list unless it already is one (None -> [])."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _clean(value):
    """Return a JSON scalar as stripped text; missing values become ''."""
    return "" if value is None else str(value).strip()


def _question_pairs(main_entity):
    """Return (question, answer) tuples for the Question items in *main_entity*."""
    pairs = []
    # mainEntity may be a single object instead of an array.
    for item in _as_list(main_entity):
        if not isinstance(item, dict) or item.get("@type") != "Question":
            continue
        answer = item.get("acceptedAnswer")
        if isinstance(answer, list):
            # Several accepted answers: compare against the first one.
            answer = answer[0] if answer else None
        if not isinstance(answer, dict):
            answer = {}
        pairs.append((_clean(item.get("name")), _clean(answer.get("text"))))
    return pairs


def _faq_status(visible_count, schema_count):
    """Return the report status line for one page's two FAQ counts."""
    if visible_count and not schema_count:
        return "❌ FAQ visible but NOT in schema"
    if schema_count and not visible_count:
        return "⚠ Schema contains FAQ but no visible FAQ (Google may remove rich result)"
    if visible_count and schema_count:
        if visible_count == schema_count:
            return "✔ FAQ schema matches visible content count"
        return f"⚠ FAQ mismatch — schema({schema_count}) vs content({visible_count})"
    return "— No FAQ present (expected for legal/about pages)"


def extract_faq_content(html, q_pattern=FAQ_QUESTION_PATTERN,
                        a_pattern=FAQ_ANSWER_PATTERN):
    """Detect visible FAQ pairs in page HTML.

    Args:
        html: Page source to scan.
        q_pattern: Regex with one capture group around each question.
        a_pattern: Regex with one capture group around each answer.

    Returns:
        A list of ``(question, answer)`` tuples with tags stripped. Questions
        and answers are paired by position; surplus items on either side are
        dropped.
    """
    questions = re.findall(q_pattern, html, flags=re.DOTALL)
    answers = re.findall(a_pattern, html, flags=re.DOTALL)
    return [(_strip_tags(q), _strip_tags(a)) for q, a in zip(questions, answers)]


def extract_schema_faq(html):
    """Detect FAQ entries inside JSON-LD blocks.

    Args:
        html: Page source to scan.

    Returns:
        A list of ``(question, answer)`` tuples taken from the ``mainEntity``
        of ``FAQPage`` objects, either at the top level of a JSON-LD block or
        inside its ``@graph`` (where ``Question`` entries carrying a
        ``mainEntity`` are accepted too). Blocks that are not valid JSON are
        skipped.
    """
    faq_list = []
    for raw in _JSON_LD_RE.findall(html):
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # Malformed block: nothing to compare, move on to the next one.
            continue
        # A JSON-LD script may hold one object or an array of objects.
        for node in _as_list(data):
            if not isinstance(node, dict):
                continue
            if "@graph" in node:
                # Graph-style schema: look at each entry of the graph.
                for entry in _as_list(node["@graph"]):
                    if not isinstance(entry, dict):
                        continue
                    if entry.get("@type") in ("FAQPage", "Question"):
                        faq_list.extend(_question_pairs(entry.get("mainEntity")))
            elif node.get("@type") == "FAQPage":
                faq_list.extend(_question_pairs(node.get("mainEntity")))
    return faq_list


def run(root=ROOT, report_path=REPORT):
    """Scan every ``.html`` file under *root* and write the validation report.

    Args:
        root: Directory tree to walk.
        report_path: File the report is written to (overwritten).
    """
    report = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Sort in place so the walk order, and therefore the report, is stable.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith(".html"):
                continue

            path = os.path.join(dirpath, filename)
            rel = os.path.relpath(path, root).replace("\\", "/")

            # Undecodable bytes are dropped rather than aborting the scan.
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                html = f.read()

            visible_count = len(extract_faq_content(html))
            schema_count = len(extract_schema_faq(html))
            status = _faq_status(visible_count, schema_count)

            report.append(
                "\n"
                f"{_RULE}\n"
                f"📄 FILE: {rel}\n"
                f"{_RULE}\n"
                f"Visible FAQ count: {visible_count}\n"
                f"Schema FAQ count: {schema_count}\n"
                f"Status: {status}\n"
            )

    with open(report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(report))

    print(f"✅ DONE — Report saved as {report_path}")


if __name__ == "__main__":
    run()