Skip to content

Commit c125918

Browse files
authored
Analysis fix v2 (#99)
* analysis fix * fix QTD
1 parent c684cfd commit c125918

File tree

10 files changed

+1114
-541
lines changed

10 files changed

+1114
-541
lines changed

app/workflows/query_text_data/variables.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ def create_session(self, prefix):
2222
self.query = SessionVariable("", prefix)
2323
self.anchored_query = SessionVariable("", prefix)
2424
self.final_report = SessionVariable("", prefix)
25-
self.target_chunks_per_cluster = SessionVariable(5, prefix)
2625
self.claim_search_depth = SessionVariable(10, prefix)
2726
self.search_type = SessionVariable("Source text", prefix)
2827
self.net_new_sources = SessionVariable(0, prefix)

app/workflows/query_text_data/workflow.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -218,15 +218,7 @@ async def create(sv: SessionVariables, workflow=None):
218218
help="If a text chunk is relevant to the query, then adjacent text chunks in the original document may be able to add additional context to the relevant points. The value of this parameter determines how many chunks before and after each relevant text chunk will be evaluated at the end of the process (or `Relevance test budget`) if they are yet to be tested."
219219
)
220220
st.markdown("##### Answer options")
221-
c1, c2, c3, c4,c5 = st.columns(5)
222-
with c1:
223-
st.number_input(
224-
"Target chunks per cluster",
225-
value=sv.target_chunks_per_cluster.value,
226-
key=sv.target_chunks_per_cluster.key,
227-
min_value=0,
228-
help="The average number of text chunks to target per cluster, which determines the text chunks that will be evaluated together and in parallel to other clusters. Larger values will generally result in more related text chunks being evaluated in parallel, but may also result in information loss from unprocessed content."
229-
)
221+
c2, c3, c4,c5 = st.columns(4)
230222
with c2:
231223
st.text("")
232224
st.text("")
@@ -459,11 +451,11 @@ def on_llm_new_token_commentary(message):
459451
with st.spinner("Generating research report..."):
460452
if sv.do_live_commentary.value:
461453
await asyncio.gather(
462-
qtd.answer_query_with_relevant_chunks(sv.target_chunks_per_cluster.value),
454+
qtd.answer_query_with_relevant_chunks(),
463455
qtd.generate_analysis_commentary()
464456
)
465457
else:
466-
await qtd.answer_query_with_relevant_chunks(sv.target_chunks_per_cluster.value)
458+
await qtd.answer_query_with_relevant_chunks()
467459
st.rerun()
468460
with report_tab:
469461
if qtd.stage.value < QueryTextDataStage.QUESTION_ANSWERED.value:

example_notebooks/query_text_data.ipynb

Lines changed: 957 additions & 500 deletions
Large diffs are not rendered by default.

intelligence_toolkit/query_text_data/answer_builder.py

Lines changed: 65 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -41,32 +41,79 @@ def extract_and_link_chunk_references(text, link=True):
4141
references = sorted(references)
4242
return text, references
4343

44+
def _build_theme_summaries_from_commentary(commentary):
45+
if commentary is None:
46+
return []
47+
48+
structure = getattr(commentary, "structure", None)
49+
if not structure:
50+
return []
51+
52+
themes = structure.get("themes") or {}
53+
points = structure.get("points") or {}
54+
point_sources = structure.get("point_sources") or {}
55+
56+
summaries = []
57+
for theme_title, point_ids in themes.items():
58+
theme_points = []
59+
for point_id in point_ids:
60+
point_title = points.get(point_id)
61+
if not point_title:
62+
continue
63+
64+
sources = point_sources.get(point_id, [])
65+
if sources:
66+
# Preserve insertion order while ensuring unique references.
67+
seen = set()
68+
ordered_sources = [str(src) for src in sources if not (src in seen or seen.add(src))]
69+
sources_text = ", ".join(ordered_sources)
70+
evidence_suffix = f" [source: {sources_text}]"
71+
else:
72+
evidence_suffix = ""
73+
74+
theme_points.append(
75+
{
76+
"point_title": point_title,
77+
"point_evidence": f"**Source evidence**: {point_title}{evidence_suffix}",
78+
"point_commentary": f"**AI commentary**: {point_title}",
79+
}
80+
)
81+
82+
if theme_points:
83+
summaries.append(
84+
dumps(
85+
{
86+
"theme_title": theme_title,
87+
"theme_points": theme_points,
88+
}
89+
)
90+
)
91+
92+
return summaries
93+
4494
async def answer_query(
4595
ai_configuration,
4696
query,
4797
expanded_query,
4898
processed_chunks,
49-
clustered_cids,
50-
cid_to_vector,
51-
target_chunks_per_cluster
99+
commentary,
52100
):
53-
print(f"Answering query with clustered ids: {clustered_cids}")
101+
print(f"Answering query with clustered ids: {commentary.get_clustered_cids()}")
54102
partitioned_texts = {}
55-
for theme, cids in clustered_cids.items():
56-
if len(cids) > target_chunks_per_cluster:
57-
cluster_to_cids = cluster_cids(cids, cid_to_vector, len(cids) // target_chunks_per_cluster)
58-
for cluster, cids in cluster_to_cids.items():
59-
partitioned_texts[f"{theme} - topic {cluster}"] = [f"{cid}: {processed_chunks.cid_to_text[cid]}" for cid in cids]
60-
else:
61-
partitioned_texts[theme] = [f"{cid}: {processed_chunks.cid_to_text[cid]}" for cid in cids]
103+
for theme, cids in commentary.get_clustered_cids().items():
104+
partitioned_texts[theme] = [f"{cid}: {processed_chunks.cid_to_text[cid]}" for cid in cids]
62105
net_new_sources = 0
63-
batched_summarization_messages = [
64-
utils.prepare_messages(
65-
prompts.theme_summarization_prompt,
66-
{"chunks": texts, "theme": theme, "query": expanded_query},
106+
107+
summarized_themes_analysis = _build_theme_summaries_from_commentary(commentary)
108+
batched_summarization_messages = []
109+
for i, (theme, texts) in enumerate(partitioned_texts.items()):
110+
previous_themes = list(theme for theme, _ in partitioned_texts.items())[:i] if i > 0 else []
111+
batched_summarization_messages.append(
112+
utils.prepare_messages(
113+
prompts.theme_summarization_prompt,
114+
{"chunks": texts, "theme": theme, "previous_themes": previous_themes, "query": expanded_query},
115+
)
67116
)
68-
for theme, texts in partitioned_texts.items()
69-
]
70117

71118
summarized_themes = await utils.map_generate_text(
72119
ai_configuration,
@@ -88,7 +135,7 @@ async def answer_query(
88135
report, references, matched_chunks = build_report_markdown(
89136
query,
90137
expanded_query,
91-
summarized_themes,
138+
summarized_themes_analysis or summarized_themes,
92139
report_wrapper,
93140
processed_chunks.cid_to_text
94141
)

intelligence_toolkit/query_text_data/api.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -232,25 +232,19 @@ async def detect_relevant_text_chunks(
232232

233233
async def answer_query_with_relevant_chunks(
234234
self,
235-
target_chunks_per_cluster: int
236235
) -> AnswerObject:
237236
"""
238237
Answer a query with relevant chunks.
239238
240-
Args:
241-
target_chunks_per_cluster (int): The target chunks per cluster
242239
Returns:
243240
AnswerObject: The answer object
244241
"""
245-
self.target_chunks_per_cluster = target_chunks_per_cluster
246242
self.answer_object: AnswerObject = await answer_builder.answer_query(
247243
self.ai_configuration,
248244
self.query,
249245
self.expanded_query,
250246
self.processed_chunks,
251-
self.commentary.get_clustered_cids(),
252-
self.cid_to_vector,
253-
self.target_chunks_per_cluster
247+
self.commentary,
254248
)
255249
self.stage = QueryTextDataStage.QUESTION_ANSWERED
256250
return self.answer_object

intelligence_toolkit/query_text_data/commentary.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def update_analysis(self, chunks: dict[int, str]):
4444
callbacks=callbacks
4545
)
4646
update_obj = loads(updates)
47+
self.structure["themes"] = {}
4748
for u in update_obj["updates"]:
4849
point_id = u["point_id"]
4950
point_title = u["point_title"]

intelligence_toolkit/query_text_data/prompts.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
- "point_evidence": a paragraph, starting with "**Source evidence**:", describing evidence from sources that support or contradict the point, without additional interpretation
8282
- "point_commentary": a paragraph, starting with "**AI commentary**:", suggesting inferences, implications, or conclusions that could be drawn from the source evidence
8383
84+
Pay attention to previous themes so that you do not repeat the same themes or points. If the theme hint is similar to a previous theme, return ONLY an empty JSON object.
8485
IMPORTANT: Make theme titles specific and focused to avoid creating duplicate or overlapping themes. If the theme hint suggests a broad category, make your theme title more specific to the actual content found in the sources.
8586
8687
--Query--
@@ -91,6 +92,10 @@
9192
9293
{theme}
9394
95+
--Previous themes--
96+
97+
{previous_themes}
98+
9499
--Source text chunks--
95100
96101
Input text chunks JSON, in the form "<source_id>: <text_chunk>":
@@ -165,11 +170,11 @@
165170
- Each point MUST contain sufficient concrete details to capture the specific source information only, and not related information
166171
- If a source relates to an existing point, the source ID MUST be assigned to the existing point ID, rather than creating a new point
167172
- If the addition of a source to a point warrants a change in point title, the point title MUST be updated
168-
- Aim for 3-7 themes overall, with an even distribution of points across themes
173+
- Aim for 2-7 themes overall, with an even distribution of points across themes
169174
- Points should be assigned to a single theme in a logical sequence that addresses the user query
170175
- Themes should contain at least two points if possible
171176
- Order themes in a logical sequence that addresses the user query
172-
- Output themes need not be the same as input themes and should be regenerated as needed to maintain 3-7 themes overall
177+
- Output themes need not be the same as input themes and should be regenerated as needed to maintain 2-7 themes overall
173178
- AVOID creating duplicate or overlapping themes - consolidate similar themes under a single, more comprehensive theme title
174179
- Before creating a new theme, check if the content could be merged with an existing theme
175180
- Theme titles should be distinct and non-overlapping - avoid themes that cover the same conceptual territory
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import json
2+
3+
import pytest
4+
5+
from intelligence_toolkit.query_text_data.answer_builder import answer_query
6+
from intelligence_toolkit.query_text_data.classes import AnswerObject
7+
8+
9+
class _DummyCommentary:
10+
def __init__(self):
11+
self.structure = {
12+
"themes": {"Theme Alpha": ["point-1"]},
13+
"points": {"point-1": "Point One"},
14+
"point_sources": {"point-1": [1]},
15+
}
16+
17+
def get_clustered_cids(self):
18+
return {"Theme Alpha": [1]}
19+
20+
21+
class _DummyProcessedChunks:
22+
cid_to_text = {
23+
1: json.dumps(
24+
{"title": "Doc", "chunk_id": 1, "text_chunk": "Chunk body"}
25+
)
26+
}
27+
28+
29+
@pytest.mark.asyncio
30+
async def test_answer_query_returns_report_with_sources(mocker):
31+
mocker.patch(
32+
"intelligence_toolkit.query_text_data.answer_builder.utils.prepare_messages",
33+
side_effect=lambda *args, **kwargs: {"args": args, "kwargs": kwargs},
34+
)
35+
mocker.patch(
36+
"intelligence_toolkit.query_text_data.answer_builder.utils.map_generate_text",
37+
new=mocker.AsyncMock(
38+
return_value=[
39+
json.dumps(
40+
{
41+
"theme_title": "Theme Alpha",
42+
"theme_points": [
43+
{
44+
"point_title": "Point One",
45+
"point_evidence": "**Source evidence**: Point One [source: 1]",
46+
"point_commentary": "**AI commentary**: Point One",
47+
}
48+
],
49+
}
50+
)
51+
]
52+
),
53+
)
54+
mocker.patch(
55+
"intelligence_toolkit.query_text_data.answer_builder.utils.generate_text",
56+
return_value=json.dumps(
57+
{
58+
"answer": "Some answer",
59+
"report_title": "Aligned Themes",
60+
"report_overview": "Overview text",
61+
"report_implications": "Implication text",
62+
}
63+
),
64+
)
65+
66+
result = await answer_query(
67+
ai_configuration={"model": "test"},
68+
query="What happened?",
69+
expanded_query="Detailed question",
70+
processed_chunks=_DummyProcessedChunks(),
71+
commentary=_DummyCommentary(),
72+
)
73+
74+
assert isinstance(result, AnswerObject)
75+
assert result.references == [1]
76+
assert "[source: [1](#source-1)]" in result.extended_answer
77+
assert "#### Source 1" in result.extended_answer
78+
assert "Doc (1)" in result.referenced_chunks

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "intelligence-toolkit"
3-
version = "0.1.0"
3+
version = "0.1.2"
44
description = "Interactive workflows for generating AI intelligence reports from real-world data sources using GPT models"
55
authors = [
66
{name = "Dayenne Souza", email = "ddesouza@microsoft.com"},

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)