Skip to content

Commit c125918

Browse files
authored
Analysis fix v2 (#99)
* analysis fix * fix QTD
1 parent c684cfd commit c125918

File tree

10 files changed

+1114
-541
lines changed

10 files changed

+1114
-541
lines changed

app/workflows/query_text_data/variables.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ def create_session(self, prefix):
2222
self.query = SessionVariable("", prefix)
2323
self.anchored_query = SessionVariable("", prefix)
2424
self.final_report = SessionVariable("", prefix)
25-
self.target_chunks_per_cluster = SessionVariable(5, prefix)
2625
self.claim_search_depth = SessionVariable(10, prefix)
2726
self.search_type = SessionVariable("Source text", prefix)
2827
self.net_new_sources = SessionVariable(0, prefix)

app/workflows/query_text_data/workflow.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -218,15 +218,7 @@ async def create(sv: SessionVariables, workflow=None):
218218
help="If a text chunk is relevant to the query, then adjacent text chunks in the original document may be able to add additional context to the relevant points. The value of this parameter determines how many chunks before and after each relevant text chunk will be evaluated at the end of the process (or `Relevance test budget`) if they are yet to be tested."
219219
)
220220
st.markdown("##### Answer options")
221-
c1, c2, c3, c4,c5 = st.columns(5)
222-
with c1:
223-
st.number_input(
224-
"Target chunks per cluster",
225-
value=sv.target_chunks_per_cluster.value,
226-
key=sv.target_chunks_per_cluster.key,
227-
min_value=0,
228-
help="The average number of text chunks to target per cluster, which determines the text chunks that will be evaluated together and in parallel to other clusters. Larger values will generally result in more related text chunks being evaluated in parallel, but may also result in information loss from unprocessed content."
229-
)
221+
c2, c3, c4,c5 = st.columns(4)
230222
with c2:
231223
st.text("")
232224
st.text("")
@@ -459,11 +451,11 @@ def on_llm_new_token_commentary(message):
459451
with st.spinner("Generating research report..."):
460452
if sv.do_live_commentary.value:
461453
await asyncio.gather(
462-
qtd.answer_query_with_relevant_chunks(sv.target_chunks_per_cluster.value),
454+
qtd.answer_query_with_relevant_chunks(),
463455
qtd.generate_analysis_commentary()
464456
)
465457
else:
466-
await qtd.answer_query_with_relevant_chunks(sv.target_chunks_per_cluster.value)
458+
await qtd.answer_query_with_relevant_chunks()
467459
st.rerun()
468460
with report_tab:
469461
if qtd.stage.value < QueryTextDataStage.QUESTION_ANSWERED.value:

example_notebooks/query_text_data.ipynb

Lines changed: 957 additions & 500 deletions
Large diffs are not rendered by default.

intelligence_toolkit/query_text_data/answer_builder.py

Lines changed: 65 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -41,32 +41,79 @@ def extract_and_link_chunk_references(text, link=True):
4141
references = sorted(references)
4242
return text, references
4343

44+
def _build_theme_summaries_from_commentary(commentary):
45+
if commentary is None:
46+
return []
47+
48+
structure = getattr(commentary, "structure", None)
49+
if not structure:
50+
return []
51+
52+
themes = structure.get("themes") or {}
53+
points = structure.get("points") or {}
54+
point_sources = structure.get("point_sources") or {}
55+
56+
summaries = []
57+
for theme_title, point_ids in themes.items():
58+
theme_points = []
59+
for point_id in point_ids:
60+
point_title = points.get(point_id)
61+
if not point_title:
62+
continue
63+
64+
sources = point_sources.get(point_id, [])
65+
if sources:
66+
# Preserve insertion order while ensuring unique references.
67+
seen = set()
68+
ordered_sources = [str(src) for src in sources if not (src in seen or seen.add(src))]
69+
sources_text = ", ".join(ordered_sources)
70+
evidence_suffix = f" [source: {sources_text}]"
71+
else:
72+
evidence_suffix = ""
73+
74+
theme_points.append(
75+
{
76+
"point_title": point_title,
77+
"point_evidence": f"**Source evidence**: {point_title}{evidence_suffix}",
78+
"point_commentary": f"**AI commentary**: {point_title}",
79+
}
80+
)
81+
82+
if theme_points:
83+
summaries.append(
84+
dumps(
85+
{
86+
"theme_title": theme_title,
87+
"theme_points": theme_points,
88+
}
89+
)
90+
)
91+
92+
return summaries
93+
4494
async def answer_query(
4595
ai_configuration,
4696
query,
4797
expanded_query,
4898
processed_chunks,
49-
clustered_cids,
50-
cid_to_vector,
51-
target_chunks_per_cluster
99+
commentary,
52100
):
53-
print(f"Answering query with clustered ids: {clustered_cids}")
101+
print(f"Answering query with clustered ids: {commentary.get_clustered_cids()}")
54102
partitioned_texts = {}
55-
for theme, cids in clustered_cids.items():
56-
if len(cids) > target_chunks_per_cluster:
57-
cluster_to_cids = cluster_cids(cids, cid_to_vector, len(cids) // target_chunks_per_cluster)
58-
for cluster, cids in cluster_to_cids.items():
59-
partitioned_texts[f"{theme} - topic {cluster}"] = [f"{cid}: {processed_chunks.cid_to_text[cid]}" for cid in cids]
60-
else:
61-
partitioned_texts[theme] = [f"{cid}: {processed_chunks.cid_to_text[cid]}" for cid in cids]
103+
for theme, cids in commentary.get_clustered_cids().items():
104+
partitioned_texts[theme] = [f"{cid}: {processed_chunks.cid_to_text[cid]}" for cid in cids]
62105
net_new_sources = 0
63-
batched_summarization_messages = [
64-
utils.prepare_messages(
65-
prompts.theme_summarization_prompt,
66-
{"chunks": texts, "theme": theme, "query": expanded_query},
106+
107+
summarized_themes_analysis = _build_theme_summaries_from_commentary(commentary)
108+
batched_summarization_messages = []
109+
for i, (theme, texts) in enumerate(partitioned_texts.items()):
110+
previous_themes = list(theme for theme, _ in partitioned_texts.items())[:i] if i > 0 else []
111+
batched_summarization_messages.append(
112+
utils.prepare_messages(
113+
prompts.theme_summarization_prompt,
114+
{"chunks": texts, "theme": theme, "previous_themes": previous_themes, "query": expanded_query},
115+
)
67116
)
68-
for theme, texts in partitioned_texts.items()
69-
]
70117

71118
summarized_themes = await utils.map_generate_text(
72119
ai_configuration,
@@ -88,7 +135,7 @@ async def answer_query(
88135
report, references, matched_chunks = build_report_markdown(
89136
query,
90137
expanded_query,
91-
summarized_themes,
138+
summarized_themes_analysis or summarized_themes,
92139
report_wrapper,
93140
processed_chunks.cid_to_text
94141
)

intelligence_toolkit/query_text_data/api.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -232,25 +232,19 @@ async def detect_relevant_text_chunks(
232232

233233
async def answer_query_with_relevant_chunks(
234234
self,
235-
target_chunks_per_cluster: int
236235
) -> AnswerObject:
237236
"""
238237
Answer a query with relevant chunks.
239238
240-
Args:
241-
target_chunks_per_cluster (int): The target chunks per cluster
242239
Returns:
243240
AnswerObject: The answer object
244241
"""
245-
self.target_chunks_per_cluster = target_chunks_per_cluster
246242
self.answer_object: AnswerObject = await answer_builder.answer_query(
247243
self.ai_configuration,
248244
self.query,
249245
self.expanded_query,
250246
self.processed_chunks,
251-
self.commentary.get_clustered_cids(),
252-
self.cid_to_vector,
253-
self.target_chunks_per_cluster
247+
self.commentary,
254248
)
255249
self.stage = QueryTextDataStage.QUESTION_ANSWERED
256250
return self.answer_object

intelligence_toolkit/query_text_data/commentary.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def update_analysis(self, chunks: dict[int, str]):
4444
callbacks=callbacks
4545
)
4646
update_obj = loads(updates)
47+
self.structure["themes"] = {}
4748
for u in update_obj["updates"]:
4849
point_id = u["point_id"]
4950
point_title = u["point_title"]

intelligence_toolkit/query_text_data/prompts.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
- "point_evidence": a paragraph, starting with "**Source evidence**:", describing evidence from sources that support or contradict the point, without additional interpretation
8282
- "point_commentary": a paragraph, starting with "**AI commentary**:", suggesting inferences, implications, or conclusions that could be drawn from the source evidence
8383
84+
Pay attention to previous themes so that you do not repeat the same themes or points. If the theme hint is similar to a previous theme, return ONLY an empty JSON object.
8485
IMPORTANT: Make theme titles specific and focused to avoid creating duplicate or overlapping themes. If the theme hint suggests a broad category, make your theme title more specific to the actual content found in the sources.
8586
8687
--Query--
@@ -91,6 +92,10 @@
9192
9293
{theme}
9394
95+
--Previous themes--
96+
97+
{previous_themes}
98+
9499
--Source text chunks--
95100
96101
Input text chunks JSON, in the form "<source_id>: <text_chunk>":
@@ -165,11 +170,11 @@
165170
- Each point MUST contain sufficient concrete details to capture the specific source information only, and not related information
166171
- If a source relates to an existing point, the source ID MUST be assigned to the existing point ID, rather than creating a new point
167172
- If the addition of a source to a point warrants a change in point title, the point title MUST be updated
168-
- Aim for 3-7 themes overall, with an even distribution of points across themes
173+
- Aim for 2-7 themes overall, with an even distribution of points across themes
169174
- Points should be assigned to a single theme in a logical sequence that addresses the user query
170175
- Themes should contain at least two points if possible
171176
- Order themes in a logical sequence that addresses the user query
172-
- Output themes need not be the same as input themes and should be regenerated as needed to maintain 3-7 themes overall
177+
- Output themes need not be the same as input themes and should be regenerated as needed to maintain 2-7 themes overall
173178
- AVOID creating duplicate or overlapping themes - consolidate similar themes under a single, more comprehensive theme title
174179
- Before creating a new theme, check if the content could be merged with an existing theme
175180
- Theme titles should be distinct and non-overlapping - avoid themes that cover the same conceptual territory
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import json
2+
3+
import pytest
4+
5+
from intelligence_toolkit.query_text_data.answer_builder import answer_query
6+
from intelligence_toolkit.query_text_data.classes import AnswerObject
7+
8+
9+
class _DummyCommentary:
10+
def __init__(self):
11+
self.structure = {
12+
"themes": {"Theme Alpha": ["point-1"]},
13+
"points": {"point-1": "Point One"},
14+
"point_sources": {"point-1": [1]},
15+
}
16+
17+
def get_clustered_cids(self):
18+
return {"Theme Alpha": [1]}
19+
20+
21+
class _DummyProcessedChunks:
22+
cid_to_text = {
23+
1: json.dumps(
24+
{"title": "Doc", "chunk_id": 1, "text_chunk": "Chunk body"}
25+
)
26+
}
27+
28+
29+
@pytest.mark.asyncio
30+
async def test_answer_query_returns_report_with_sources(mocker):
31+
mocker.patch(
32+
"intelligence_toolkit.query_text_data.answer_builder.utils.prepare_messages",
33+
side_effect=lambda *args, **kwargs: {"args": args, "kwargs": kwargs},
34+
)
35+
mocker.patch(
36+
"intelligence_toolkit.query_text_data.answer_builder.utils.map_generate_text",
37+
new=mocker.AsyncMock(
38+
return_value=[
39+
json.dumps(
40+
{
41+
"theme_title": "Theme Alpha",
42+
"theme_points": [
43+
{
44+
"point_title": "Point One",
45+
"point_evidence": "**Source evidence**: Point One [source: 1]",
46+
"point_commentary": "**AI commentary**: Point One",
47+
}
48+
],
49+
}
50+
)
51+
]
52+
),
53+
)
54+
mocker.patch(
55+
"intelligence_toolkit.query_text_data.answer_builder.utils.generate_text",
56+
return_value=json.dumps(
57+
{
58+
"answer": "Some answer",
59+
"report_title": "Aligned Themes",
60+
"report_overview": "Overview text",
61+
"report_implications": "Implication text",
62+
}
63+
),
64+
)
65+
66+
result = await answer_query(
67+
ai_configuration={"model": "test"},
68+
query="What happened?",
69+
expanded_query="Detailed question",
70+
processed_chunks=_DummyProcessedChunks(),
71+
commentary=_DummyCommentary(),
72+
)
73+
74+
assert isinstance(result, AnswerObject)
75+
assert result.references == [1]
76+
assert "[source: [1](#source-1)]" in result.extended_answer
77+
assert "#### Source 1" in result.extended_answer
78+
assert "Doc (1)" in result.referenced_chunks

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "intelligence-toolkit"
3-
version = "0.1.0"
3+
version = "0.1.2"
44
description = "Interactive workflows for generating AI intelligence reports from real-world data sources using GPT models"
55
authors = [
66
{name = "Dayenne Souza", email = "ddesouza@microsoft.com"},

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)