import os
import re
import json
# Directory tree that run() walks looking for .html files.
# NOTE(review): this is a machine-specific absolute path — adjust it before
# running the script anywhere else.
ROOT = r"C:\Users\Bodia\OneDrive\Робочий стіл\myipnow-main"
# File the text report is written to; being a relative path, it resolves
# against the current working directory.
REPORT = "faq_schema_validation_report.txt"
# NOTE(review): the literal HTML tags inside the original question/answer
# patterns were lost (only the bare ``(.*?)`` groups survived), which left the
# string literals unterminated. The defaults below are a best-guess
# reconstruction of a common accordion layout (``button.accordion`` followed
# by ``div.panel``). Confirm them against the site's real FAQ markup, or pass
# explicit patterns to extract_faq_content().
_DEFAULT_Q_PATTERN = r'<button class="accordion">(.*?)</button>'
_DEFAULT_A_PATTERN = r'<div class="panel">(.*?)</div>'

# Strips any remaining inline markup from a captured fragment; DOTALL so that
# tags whose attributes wrap onto the next line are removed as well.
_TAG_RE = re.compile(r"<.*?>", re.DOTALL)


def extract_faq_content(html, q_pattern=_DEFAULT_Q_PATTERN, a_pattern=_DEFAULT_A_PATTERN):
    """Detect visible FAQ pairs from accordion HTML.

    Args:
        html: Page source to scan.
        q_pattern: Regex with exactly one capturing group around the question
            text. Matched with ``re.DOTALL``.
        a_pattern: Regex with exactly one capturing group around the answer
            text. Matched with ``re.DOTALL``.

    Returns:
        A list of ``(question, answer)`` tuples with tags removed and
        surrounding whitespace stripped. Questions and answers are paired by
        position; if one side has more matches than the other, the surplus
        matches are dropped.
    """
    questions = re.findall(q_pattern, html, flags=re.DOTALL)
    answers = re.findall(a_pattern, html, flags=re.DOTALL)
    # zip() stops at the shorter list, matching the positional pairing rule.
    return [
        (_TAG_RE.sub("", question).strip(), _TAG_RE.sub("", answer).strip())
        for question, answer in zip(questions, answers)
    ]
# Captures the body of every <script type="application/ld+json"> element.
# The original pattern literal was empty (its markup had been lost), so no
# JSON-LD block could ever be found.
_JSON_LD_RE = re.compile(
    r'''<script\b[^>]*\btype\s*=\s*["']application/ld\+json["'][^>]*>(.*?)</script\s*>''',
    re.DOTALL | re.IGNORECASE,
)


def _as_list(value):
    """Wrap a lone JSON-LD value in a list; map None to an empty list."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _has_type(node, wanted):
    """Return True when the node's ``@type`` (string or list) names one of *wanted*."""
    return any(type_name in wanted for type_name in _as_list(node.get("@type")))


def _questions_from(node):
    """Collect (question, answer) pairs from the ``Question`` items in ``mainEntity``."""
    pairs = []
    for item in _as_list(node.get("mainEntity")):
        if not isinstance(item, dict) or not _has_type(item, ("Question",)):
            continue
        # acceptedAnswer may be a single object or a list; use the first object.
        answers = [a for a in _as_list(item.get("acceptedAnswer")) if isinstance(a, dict)]
        answer = answers[0] if answers else {}
        question_text = str(item.get("name") or "").strip()
        answer_text = str(answer.get("text") or "").strip()
        pairs.append((question_text, answer_text))
    return pairs


def extract_schema_faq(html):
    """Detect FAQ inside JSON-LD blocks.

    Scans every ``<script type="application/ld+json">`` element in *html*.
    A block may be a single object or an array of objects. For each object:

    * if it has an ``@graph``, every graph entry typed ``FAQPage`` or
      ``Question`` contributes the ``Question`` items of its ``mainEntity``;
    * otherwise, an object typed ``FAQPage`` contributes the ``Question``
      items of its ``mainEntity``.

    Args:
        html: Page source to scan.

    Returns:
        A list of ``(question, answer)`` tuples taken from each question's
        ``name`` and its ``acceptedAnswer`` ``text``, whitespace-stripped.
        Blocks that are not valid JSON are skipped.
    """
    faq_list = []
    for raw in _JSON_LD_RE.findall(html):
        try:
            data = json.loads(raw)
        except ValueError:
            # Malformed JSON-LD: skip this block but keep scanning the rest.
            continue
        for doc in _as_list(data):
            if not isinstance(doc, dict):
                continue
            if "@graph" in doc:
                # support graph-style schemas
                for entry in _as_list(doc["@graph"]):
                    if isinstance(entry, dict) and _has_type(entry, ("FAQPage", "Question")):
                        faq_list.extend(_questions_from(entry))
            elif _has_type(doc, ("FAQPage",)):
                faq_list.extend(_questions_from(doc))
    return faq_list
def _faq_status(visible_faq, schema_faq):
    """Return the one-line verdict comparing visible FAQ pairs with schema FAQ pairs."""
    if visible_faq and not schema_faq:
        return "❌ FAQ visible but NOT in schema"
    if schema_faq and not visible_faq:
        return "⚠ Schema contains FAQ but no visible FAQ (Google may remove rich result)"
    if not visible_faq:
        # Neither source produced any FAQ pairs.
        return "— No FAQ present (expected for legal/about pages)"
    if len(visible_faq) == len(schema_faq):
        return "✔ FAQ schema matches visible content count"
    return f"⚠ FAQ mismatch — schema({len(schema_faq)}) vs content({len(visible_faq)})"


def run():
    """Check every HTML file under ROOT and write a FAQ/schema report to REPORT.

    For each ``.html`` file (extension matched case-insensitively) the visible
    FAQ pairs and the JSON-LD FAQ pairs are extracted and their counts are
    compared. One report section per file is written to REPORT, and a
    completion message is printed.

    If ROOT is not an existing directory, an error is printed and nothing is
    written: ``os.walk`` would otherwise yield nothing and the script would
    report success with an empty file.
    """
    if not os.path.isdir(ROOT):
        print(f"❌ ROOT directory not found: {ROOT}")
        return

    report = []
    for root, dirs, files in os.walk(ROOT):
        # Sorting dirs in place fixes the traversal order, so repeated runs
        # produce the report sections in the same order.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(".html"):
                continue
            path = os.path.join(root, file)
            # Forward slashes keep the reported paths uniform across platforms.
            rel = os.path.relpath(path, ROOT).replace("\\", "/")
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                html = f.read()
            visible_faq = extract_faq_content(html)
            schema_faq = extract_schema_faq(html)
            status = _faq_status(visible_faq, schema_faq)
            report.append(f"""
━━━━━━━━━━━━━━━━━━━━━━━━━━
📄 FILE: {rel}
━━━━━━━━━━━━━━━━━━━━━━━━━━
Visible FAQ count: {len(visible_faq)}
Schema FAQ count: {len(schema_faq)}
Status: {status}
""")

    with open(REPORT, "w", encoding="utf-8") as f:
        f.write("\n".join(report))
    print(f"✅ DONE — Report saved as {REPORT}")
# Run the validation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run()