itsalissonsilva committed on
Commit f2542bb · verified · 1 Parent(s): 77cce75

Upload app.py

Files changed (1)
  1. app.py +180 -282
app.py CHANGED
@@ -1,292 +1,190 @@
- import streamlit as st
- import pdfplumber
  import os
  import tempfile
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import FAISS
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain.llms.base import LLM
- from typing import Optional, List, Mapping, Any
- import json
- import urllib.request
-
- class AzureEndpointLLM(LLM):
-     endpoint_url: str
-     api_key: str
-
-     def __init__(self, **kwargs):
-         # Pull secrets from env
-         kwargs["endpoint_url"] = os.environ.get("AZURE_ENDPOINT_URL")
-         kwargs["api_key"] = os.environ.get("AZURE_API_KEY")
-
-         if not kwargs["endpoint_url"] or not kwargs["api_key"]:
-             raise ValueError("Missing Azure endpoint URL or API key")
-
-         super().__init__(**kwargs)
-
-     @property
-     def _llm_type(self) -> str:
-         return "azure-endpoint-llm"
-
-     def _call(
-         self,
-         prompt: str,
-         stop: Optional[List[str]] = None,
-         run_manager: Optional[Any] = None,
-         **kwargs: Any,
-     ) -> str:
-         # Format expected by Azure LLM endpoint
-         input_data = {
-             "input_data": {
-                 "input_string": [{"role": "user", "content": prompt}],
-                 "parameters": {
-                     "temperature": 0.7,
-                     "max_tokens": 1024
-                 }
-             }
-         }

-         body = str.encode(json.dumps(input_data))

-         headers = {
-             'Content-Type': 'application/json',
-             'Accept': 'application/json',
-             'Authorization': 'Bearer ' + self.api_key
-         }

-         req = urllib.request.Request(self.endpoint_url, body, headers)
-         try:
-             response = urllib.request.urlopen(req)
-             result = response.read().decode("utf-8")
-             parsed = json.loads(result)

-             # Adjust this depending on how Azure returns generated text
-             if isinstance(parsed, dict) and "output" in parsed:
-                 return parsed["output"]
-             elif isinstance(parsed, dict):
-                 return json.dumps(parsed)
-             return parsed
-         except urllib.error.HTTPError as e:
-             print("Request failed:", e.code)
-             print(e.read().decode("utf-8", "ignore"))
-             return "Oops, there was an error calling the Azure model."
-
-
- class ChatHistory:
-     def __init__(self):
-         self.messages = []
-
-     def add_user_message(self, message):
-         self.messages.append({"role": "user", "content": message})
-
-     def add_assistant_message(self, message, sources=None):
-         self.messages.append({
-             "role": "assistant",
-             "content": message,
-             "sources": sources if sources else []
-         })
-
-     def get_conversation_history(self, include_sources=False):
-         if include_sources:
-             return self.messages
-         else:
-             return [{"role": m["role"], "content": m["content"]} for m in self.messages]
-
-     def get_messages_for_display(self):
-         return self.messages
-
-     def clear(self):
-         self.messages = []
-
- st.set_page_config(page_title="RAG Chat with Azure LLM", page_icon="💬", layout="wide")
-
- if 'vector_store' not in st.session_state:
-     st.session_state.vector_store = None
- if 'document_processed' not in st.session_state:
-     st.session_state.document_processed = False
- if 'file_name' not in st.session_state:
-     st.session_state.file_name = None
- if 'document_text' not in st.session_state:
-     st.session_state.document_text = ""
- if 'chat_history' not in st.session_state:
-     st.session_state.chat_history = ChatHistory()
-
- def extract_text_from_document(document_file):
-     file_type = document_file.name.split('.')[-1].lower()
-     if file_type == 'txt':
-         return document_file.getvalue().decode('utf-8')
-     elif file_type == 'pdf':
-         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
-             tmp_file.write(document_file.getvalue())
-             tmp_file_path = tmp_file.name
-
-         text = ""
-         try:
-             with pdfplumber.open(tmp_file_path) as pdf:
-                 for page in pdf.pages:
-                     page_text = page.extract_text()
-                     if page_text:
-                         text += page_text + "\n\n"
-         except Exception as e:
-             st.error(f"Error extracting text from PDF: {e}")
-         finally:
-             if os.path.exists(tmp_file_path):
-                 os.remove(tmp_file_path)
-         return text
-     else:
-         st.error(f"Unsupported file type: {file_type}")
-         return ""
-
- def create_chunks(text):
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, length_function=len)
-     return text_splitter.split_text(text)
-
- def create_vector_store(chunks):
-     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cpu'})
-     return FAISS.from_texts(chunks, embeddings)
-
- def retrieve_relevant_chunks(vector_store, query, k=3):
-     if not vector_store:
-         return []
-     return vector_store.similarity_search(query, k=k)
-
- def generate_rag_response(query, chat_history, vector_store):
-     llm = AzureEndpointLLM()
-     relevant_docs = retrieve_relevant_chunks(vector_store, query, k=3)
-
-     if not relevant_docs:
-         return "I couldn't find any relevant information in the document to answer your question.", []
-
-     context = "\n\n".join([doc.page_content for doc in relevant_docs])
-     conversation_history = ""
-     for msg in chat_history.get_conversation_history():
-         role = "User" if msg["role"] == "user" else "Assistant"
-         conversation_history += f"{role}: {msg['content']}\n\n"
-
-     prompt = f"""
- You are a helpful assistant that provides accurate information based only on the given context and conversation history.
- 1. Use only the context below and the conversation history to answer the question.
- 2. If the answer is not in the context, reply with \"I don't have enough information to answer this question.\"
- 3. Be friendly and helpful.
- 4. Maintain continuity with the conversation history.
- Conversation History:
- {conversation_history}
- Context from document:
- {context}
- User's question: {query}
- Answer:
- """
-
-     response = llm(prompt)
-     return response, relevant_docs
-
- def process_user_message(user_message):
-     st.session_state.chat_history.add_user_message(user_message)
-     with st.spinner("Thinking..."):
-         response, source_docs = generate_rag_response(user_message, st.session_state.chat_history, st.session_state.vector_store)
-     sources = [{"id": i + 1, "content": doc.page_content} for i, doc in enumerate(source_docs)]
-     st.session_state.chat_history.add_assistant_message(response, sources)
-     return response, sources
-
- st.title("💬 RAG Chat with Azure Model")
- st.markdown("""
- Upload a PDF or TXT document and chat about its content. This system uses:
- - Document text extraction
- - Text chunking and embedding
- - Azure LLM for answering questions
- - Memory to maintain conversation context
- """)
-
- with st.sidebar:
-     st.header("Configuration")
-     uploaded_file = st.file_uploader("Upload a document", type=['pdf', 'txt'])
-     if st.button("Clear Chat History"):
-         st.session_state.chat_history.clear()
-         st.success("Chat history cleared!")
-     st.markdown("**Using Azure-deployed model**")
-     st.markdown("---")
-     st.markdown("### About")
      st.markdown("""
-     This is a RAG Chat system that:
-     1. Processes PDF and TXT documents
-     2. Creates a vector database of document content
-     3. Maintains conversation history
-     4. Retrieves relevant information for user queries
-     5. Generates contextual answers using your Azure-deployed LLM
      """)
-
- if uploaded_file is not None:
-     if st.session_state.file_name != uploaded_file.name:
-         st.session_state.file_name = uploaded_file.name
-         st.session_state.document_processed = False
-
-     if not st.session_state.document_processed:
-         with st.spinner(f"Processing {uploaded_file.name.split('.')[-1].upper()} file..."):
-             text = extract_text_from_document(uploaded_file)
-             st.session_state.document_text = text
-             chunks = create_chunks(text)
-             st.session_state.vector_store = create_vector_store(chunks)
-             st.session_state.document_processed = True
-             st.success(f"Document processed successfully: {uploaded_file.name}")
-             num_chunks = len(chunks)
-             avg_chunk_size = sum(len(chunk) for chunk in chunks) / num_chunks if num_chunks > 0 else 0
-             st.info(f"Document processed into {num_chunks} chunks with average size of {avg_chunk_size:.0f} characters")
-
- col1, col2 = st.columns([3, 1])
-
- with col1:
-     st.subheader("Chat")
-     chat_container = st.container()
-     with chat_container:
-         for message in st.session_state.chat_history.get_messages_for_display():
-             with st.chat_message(message["role"]):
-                 st.markdown(message["content"])
-                 if message["role"] == "assistant" and "sources" in message and message["sources"]:
-                     with st.expander("View Sources"):
-                         for source in message["sources"]:
-                             st.markdown(f"**Source {source['id']}**")
-                             st.text(source["content"])
-
-     if st.session_state.document_processed:
-         user_input = st.chat_input("Type your message here...")
-         if user_input:
-             with st.chat_message("user"):
-                 st.markdown(user_input)
-             response, sources = process_user_message(user_input)
-             with st.chat_message("assistant"):
-                 st.markdown(response)
-                 if sources:
-                     with st.expander("View Sources"):
-                         for source in sources:
-                             st.markdown(f"**Source {source['id']}**")
-                             st.text(source["content"])
-     else:
-         st.info("Please upload a document to start chatting")
-
- with col2:
-     if st.session_state.document_processed:
-         st.subheader("Document Preview")
-         with st.expander("View Document Text", expanded=False):
-             st.text_area(
-                 "Extracted Text",
-                 st.session_state.document_text[:5000] +
-                 ("..." if len(st.session_state.document_text) > 5000 else ""),
-                 height=400)
-     else:
-         st.info("Upload a PDF or TXT document to get started")
-
- if not st.session_state.document_processed:
-     st.markdown("""
-     ## Getting Started
-     1. **Upload a PDF or TXT document** using the file uploader in the sidebar
-     2. Wait for the document to be processed
-     3. Start chatting with the AI about the document
-     4. The chat remembers the conversation context
-     5. Clear the chat history using the button in the sidebar
-     The system uses an Azure-deployed LLM and maintains conversation memory.
      """)

- st.sidebar.info("""
- **Using Azure ML Endpoint**
- This application calls a large language model deployed on Azure.
- """)
 
 
 
  import os
+ import streamlit as st
  import tempfile
+ from pathlib import Path
+ from typing import Dict, List
+
+ # Import the gap analysis pipeline - use our new implementation
+ from attached_assets.ifrs9_analysis import run_gap_analysis_pipeline, GapAnalysisState
+ from attached_assets.file_handler import extract_text_from_file
+ from attached_assets.consolidated_analysis import generate_consolidated_analysis, display_consolidated_analysis
+ from attached_assets.pdf_generator import generate_gap_analysis_pdf, generate_consolidated_pdf
+
+ # Configure page
+ st.set_page_config(
+     page_title="IFRS9 Validation GAP Analysis",
+     page_icon="📊",
+     layout="wide"
+ )
+
+ # Parameter 'results' is the output of run_gap_analysis_pipeline() which returns GapAnalysisState
+ def display_results(results: GapAnalysisState) -> None:
+     """Display the GAP analysis results in a structured format."""
+     st.header("📝 GAP Analysis Results")

+     # Convert results to dict if needed
+     queries = results.queries if hasattr(results, 'queries') else results.get('queries', [])
+     gap_analyses = results.gap_analyses if hasattr(results, 'gap_analyses') else results.get('gap_analyses', {})

+     if not queries:
+         st.error("No queries generated. Please try running the analysis again.")
+         return

+     # Generate consolidated analysis if not already present
+     if 'consolidated_analysis' not in st.session_state:
+         with st.spinner("Generating consolidated analysis..."):
+             try:
+                 consolidated_analysis = generate_consolidated_analysis(queries, gap_analyses)
+                 st.session_state.consolidated_analysis = consolidated_analysis
+             except Exception as e:
+                 st.error(f"Error generating consolidated analysis: {str(e)}")
+                 st.exception(e)

+     # Create tabs for different views
+     tabs = st.tabs(["📊 Consolidated Analysis", "🔍 Individual Queries"])
+
+     # Display consolidated analysis
+     with tabs[0]:
+         if 'consolidated_analysis' in st.session_state:
+             # Display the consolidated analysis with visualizations
+             display_consolidated_analysis(st.session_state.consolidated_analysis)
+
+             # Add PDF Export button for consolidated report
+             if st.button("📄 Export Consolidated Report as PDF"):
+                 try:
+                     pdf_path = generate_consolidated_pdf(st.session_state.consolidated_analysis, queries, gap_analyses)
+                     with open(pdf_path, "rb") as pdf_file:
+                         pdf_bytes = pdf_file.read()
+                     st.download_button(
+                         label="⬇️ Download Consolidated PDF Report",
+                         data=pdf_bytes,
+                         file_name="consolidated_gap_analysis_report.pdf",
+                         mime="application/pdf"
+                     )
+                     # Clean up the file after offering download
+                     os.remove(pdf_path)
+                 except Exception as e:
+                     st.error(f"Error generating consolidated PDF: {str(e)}")
+
+     # Display individual query analyses
+     with tabs[1]:
+         # Add PDF Export button for individual queries
+         if st.button("📄 Export Individual Queries as PDF"):
+             try:
+                 pdf_path = generate_gap_analysis_pdf(queries, gap_analyses)
+                 with open(pdf_path, "rb") as pdf_file:
+                     pdf_bytes = pdf_file.read()
+                 st.download_button(
+                     label="⬇️ Download Individual Queries PDF",
+                     data=pdf_bytes,
+                     file_name="individual_queries_report.pdf",
+                     mime="application/pdf"
+                 )
+                 # Clean up the file after offering download
+                 os.remove(pdf_path)
+             except Exception as e:
+                 st.error(f"Error generating PDF: {str(e)}")
+
+         # Create sub-tabs for query-based navigation
+         query_tabs = st.tabs([f"Query {i+1}" for i in range(len(queries))])
+
+         # Fill each tab with the corresponding query and response
+         for i, (tab, query) in enumerate(zip(query_tabs, queries)):
+             with tab:
+                 if query in gap_analyses:
+                     response = gap_analyses[query]
+                     st.markdown(f"### 🔹 {query}")
+                     st.markdown(f"➡️ {response}")
+                 else:
+                     st.warning("No analysis available for this query.")
+
+ def main():
+     """Main application function."""
+     st.title("IFRS9 Validation GAP Analysis")
      st.markdown("""
+     This application allows you to upload IFRS9 validation reports and generates
+     a comprehensive GAP analysis using advanced language models.
      """)
+
+     # File uploader widget
+     uploaded_file = st.file_uploader(
+         "Upload your IFRS9 validation report",
+         type=['txt', 'pdf', 'docx', 'doc'],
+         help="Upload a text, PDF, or Word document containing your IFRS9 validation report."
+     )
+
+     # Initialize variables
+     run_analysis = False
+     file_contents = ""
+
+     if uploaded_file is not None:
+         try:
+             # Extract text from the uploaded file based on its type
+             file_contents = extract_text_from_file(uploaded_file)
+
+             # Display file summary
+             st.subheader("Document Preview")
+             st.markdown(f"**File:** {uploaded_file.name}")
+             file_size = len(uploaded_file.getvalue()) / 1024
+             st.markdown(f"**Size:** {file_size:.2f} KB")
+             st.markdown(f"**File Type:** {uploaded_file.name.split('.')[-1].upper()}")
+
+             # Show preview of the extracted text
+             with st.expander("Preview document content", expanded=False):
+                 preview_text = file_contents[:5000] + "..." if len(file_contents) > 5000 else file_contents
+                 st.text_area("Document content (preview)", preview_text, height=200)
+
+             # Process button
+             if st.button("Run GAP Analysis", type="primary"):
+                 run_analysis = True
+
+         except Exception as e:
+             st.error(f"❌ Error processing file: {str(e)}")
+             st.exception(e)
+
+     # Run the analysis when requested
+     if run_analysis and file_contents:
+         with st.spinner("Running GAP analysis... This may take several minutes depending on document length."):
+             try:
+                 # Clear previous results to avoid duplication
+                 if 'results' in st.session_state:
+                     del st.session_state.results
+                 if 'consolidated_analysis' in st.session_state:
+                     del st.session_state.consolidated_analysis
+
+                 # Run the GAP analysis pipeline with the uploaded document
+                 results = run_gap_analysis_pipeline(file_contents)
+
+                 # Convert results to dict for session state storage
+                 st.session_state.results = {
+                     'queries': results["queries"],
+                     'gap_analyses': results["gap_analyses"]
+                 }
+
+             except Exception as e:
+                 st.error(f"❌ Error during GAP analysis: {str(e)}")
+                 st.exception(e)
+
+     # Display results if they exist in session state
+     if 'results' in st.session_state:
+         display_results(st.session_state.results)
+
+     # Application info
+     st.sidebar.header("About")
+     st.sidebar.markdown("""
+     This application performs GAP analysis on IFRS9 validation reports using
+     LangChain and LangGraph with GPT-4o.
+
+     The analysis includes:
+     - Generating critical GAP analysis questions
+     - Retrieving relevant information
+     - Filtering pertinent documents
+     - Performing detailed GAP analysis
      """)
+
+     # API key check (status only, not the actual key)
+     api_key_status = "✅ Available" if os.environ.get("OPENAI_API_KEY") else "❌ Missing"
+     st.sidebar.markdown(f"**OpenAI API Key:** {api_key_status}")

+ if __name__ == "__main__":
+     main()
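
For orientation, here is a minimal sketch of the interface the new app.py assumes from attached_assets.ifrs9_analysis. The real module ships with the Space and may differ; the TypedDict shape is inferred from main() reading results["queries"] and results["gap_analyses"], and the stub body is purely illustrative:

    from typing import Dict, List, TypedDict

    class GapAnalysisState(TypedDict):
        # Inferred shape: app.py reads results["queries"] and results["gap_analyses"]
        queries: List[str]            # generated GAP analysis questions
        gap_analyses: Dict[str, str]  # question -> analysis text

    def run_gap_analysis_pipeline(document_text: str) -> GapAnalysisState:
        """Illustrative stub matching the call site in main()."""
        queries = ["Is the staging logic for significant increase in credit risk validated?"]
        return GapAnalysisState(
            queries=queries,
            gap_analyses={q: "Placeholder analysis." for q in queries},
        )

Because a TypedDict instance is a plain dict at runtime, hasattr(results, 'queries') is False, which is why display_results also falls back to results.get('queries', []).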