import pprint

import regex as re
from langchain_core.documents import Document

from app.embeddings.pgvector_store import get_vector_store


def build_context(docs: list[Document]) -> str:
    """Format retrieved documents as a numbered reference list.

    Each document becomes ``[i]:<page_content>`` (1-based index), with the
    entries joined by blank lines so a prompt can cite sources by index.
    Returns ``""`` for an empty list (join over an empty generator), which
    makes the explicit empty-check of the original version unnecessary.
    """
    return "\n\n".join(
        f"[{i}]:{doc.page_content}" for i, doc in enumerate(docs, start=1)
    )


def merge_docs(docs: list[Document]) -> list[Document]:
    """Collapse retrieved chunks sharing a source_id into single documents.

    Chunks are grouped by the ``source_id`` metadata key. For each group the
    first chunk supplies the metadata; the global header (Title + Target
    Country) is captured once from the first chunk whose content matches the
    expected layout, and each distinct section body is appended once. Chunks
    that do not match the layout are kept verbatim as their own section.
    """
    if not docs:
        return []

    # Group 1: "Title: ...\nTarget Country: ...\n" (shared header).
    # Group 2: "Section Context: ..." through end of chunk (DOTALL).
    header_section_re = re.compile(
        r"(Title: .*?\nTarget Country: .*?\n)(Section Context: .*)", re.DOTALL
    )

    grouped = {}
    for doc in docs:
        key = doc.metadata.get("source_id")
        entry = grouped.setdefault(
            key, {"metadata": doc.metadata, "header_info": "", "sections": []}
        )

        m = header_section_re.search(doc.page_content)
        if m is None:
            # Fallback: layout not recognized, keep the raw chunk text.
            header, section = "", doc.page_content
        else:
            header, section = m.group(1), m.group(2)

        if header and not entry["header_info"]:
            entry["header_info"] = header
        if section not in entry["sections"]:
            entry["sections"].append(section)

    # Header first (Title + Country), then the deduplicated sections.
    return [
        Document(
            page_content=entry["header_info"] + "\n\n".join(entry["sections"]),
            metadata=entry["metadata"],
        )
        for entry in grouped.values()
    ]


# --- Demo: retrieve programs, inspect raw hits, then merge + build context ---
programs = get_vector_store("programs")

# Metadata filter: only Australian Bachelor-level programs.
# (Replaces the original mutable-append construction; also removes the
# unused `filter = []` variable that shadowed the `filter` builtin.)
retrieval_filter = {
    "$and": [
        {"countries": {"$eq": "Australia"}},
        {"degree_level": {"$eq": "Bachelor"}},
    ]
}
retrieved_docs_raw = programs.similarity_search(
    query="BSc Nursing scholarships", k=5, filter=retrieval_filter
)
for doc in retrieved_docs_raw:
    print("Metadata:")
    pprint.pprint(doc.metadata)
    print("Content:")
    print(doc.page_content)

retrieved_docs = merge_docs(retrieved_docs_raw)
context = build_context(retrieved_docs)
print(f"Context used: {context}")
