Kokush1bo committed on
Commit
d58ed16
·
verified ·
1 Parent(s): dbe2e04

Upload pattern_matching.py

Browse files
Files changed (1) hide show
  1. pattern_matching.py +127 -0
pattern_matching.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict, Union
3
+ from collections import defaultdict
4
+
5
+ # Enhanced profanity word list with variations
6
# Regex fragments for profane words.  Entries are joined into one alternation
# below, so the asterisks used for self-censored spellings are escaped.
PROFANE_WORDS = [
    # Base words
    r'damn', r'hell', r'shit', r'fuck', r'bastard', r'asshole', r'crap',
    r'dick', r'piss', r'slut', r'bitch', r'motherfucker', r'nigger',

    # Common variations and misspellings
    r'f\*ck', r'f\*\*k', r'sh\*t', r'b\*tch', r'a\*hole', r'd\*mn',
    r'f\*\*\*', r's\*\*t', r'f\*\*', r'f\*\*\*ing', r'f\*\*king',
    r'fuk', r'shitty', r'fucking', r'fcked', r'fcker'
]

# Case-insensitive matcher for any entry of PROFANE_WORDS as a standalone token.
#
# * Left edge: the match must not be preceded by a word character, so "hell"
#   is not found inside "shell".
# * Right edge: a plain `\b` is wrong for entries that end in `*`, because
#   there is no word boundary between `*` and a following space; "f*** you"
#   would never match.  The right edge is therefore satisfied either when the
#   match itself ends in a non-word character, or when no word character
#   follows it.
# * Alternatives are tried longest-first so that "f***ing" is reported whole
#   instead of as its shorter prefix "f***".
PROFANITY_REGEX = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(sorted(PROFANE_WORDS, key=len, reverse=True))
    + r')(?:(?<=\W)|(?!\w))',
    re.IGNORECASE,
)
22
+
23
def pattern_detect_profanity(conversation: List[Dict[str, Union[str, int]]]) -> Dict[str, List[Dict]]:
    """
    Flag utterances whose text matches PROFANITY_REGEX, grouped by speaker role.

    Args:
        conversation: List of utterance dictionaries.  Only the 'speaker' and
            'text' entries are read; any other entries are carried over into
            the flagged copy unchanged.

    Returns:
        Dictionary mapping 'Agent' and/or 'Borrower' to a list of shallow
        copies of the flagged utterances, each extended with a
        'matched_words' list of the matched substrings.  A role key is only
        present when at least one of that role's utterances was flagged.
        Utterances whose speaker contains neither "agent" nor "borrower"
        (case-insensitive) are ignored.
    """
    result = defaultdict(list)

    for utterance in conversation:
        # Tolerate a missing, None or non-string speaker label.
        speaker = str(utterance.get("speaker") or "").strip().lower()
        text = utterance.get("text") or ""

        if not isinstance(text, str) or not text:
            continue

        # finditer() returns an iterator, which is truthy even when it yields
        # nothing, so materialize the matches before testing for emptiness.
        matched_words = [m.group() for m in PROFANITY_REGEX.finditer(text)]
        if not matched_words:
            continue

        # Copy so the caller's utterance dictionaries are not mutated.
        flagged_utterance = utterance.copy()
        flagged_utterance["matched_words"] = matched_words

        # Categorize by speaker
        if "agent" in speaker:
            result["Agent"].append(flagged_utterance)
        elif "borrower" in speaker:
            result["Borrower"].append(flagged_utterance)

    return dict(result)
57
+
58
+
59
+ # Enhanced sensitive info and verification patterns
60
# Patterns for sensitive account data appearing in an utterance.
SENSITIVE_PATTERNS = [
    # Account numbers
    r'\baccount\s*(?:number|#)?\s*[:=]?\s*\d+',
    # Monetary amounts.  Thousands groups and a decimal part are consumed so
    # that "$1,234.56" is reported whole rather than truncated to "$1".
    r'\b(?:balance|amount due|outstanding|payment due)\s*[:=]?\s*\$\d+(?:,\d{3})*(?:\.\d+)?',
    # Card numbers (16 digits, optionally grouped in fours)
    r'\b(?:credit|debit)\s*card\s*(?:number|#)?\s*[:=]?\s*\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}',
    # SSN
    r'\b(?:ssn|social security)\s*(?:number|#)?\s*[:=]?\s*\d{3}[-\s]?\d{2}[-\s]?\d{4}',
]

# Patterns treated as an identity-verification step in an utterance.
VERIFICATION_PATTERNS = [
    r'\b(?:verify|verification|confirm)\b.*\b(?:identity|yourself)\b',
    r'\b(?:date of birth|dob)\s*[:=]?\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
    r'\b(?:address)\s*[:=]?\s*(?:\d+\s+[\w\s]+,\s*[\w\s]+,\s*[A-Z]{2}\s*\d{5})',
    r'\b(?:social security|ssn)\s*(?:number|#)?\s*[:=]?\s*\d{3}[-\s]?\d{2}[-\s]?\d{4}',
    r'\b(?:last\s*4\s*digits\s*of\s*ssn)\s*[:=]?\s*\d{4}',
]

# Compiled once at import time; both lists keep the order of their sources.
SENSITIVE_REGEX = [re.compile(pattern, re.IGNORECASE) for pattern in SENSITIVE_PATTERNS]
VERIFICATION_REGEX = [re.compile(pattern, re.IGNORECASE) for pattern in VERIFICATION_PATTERNS]
78
+
79
def pattern_detect_compliance_violation(conversation: List[Dict[str, Union[str, int]]]) -> List[Dict]:
    """
    Flag agent utterances that disclose sensitive information before verification.

    The conversation is scanned in order.  Only agent utterances are examined.
    Verification is considered complete once an agent utterance matches one of
    VERIFICATION_REGEX and also contains "correct" or "match" (a simplified
    heuristic); from then on nothing further is flagged.

    Args:
        conversation: List of utterance dictionaries.  Only the 'speaker' and
            'text' entries are read; any other entries are carried over into
            the flagged copy unchanged.

    Returns:
        List of shallow copies of the violating utterances, each extended with
        'sensitive_info' (first match of every SENSITIVE_REGEX pattern that
        hit) and 'verification_status' (a dict with 'verified' and
        'attempted' flags as of that utterance).
    """
    violations = []
    verified = False
    verification_attempted = False

    for utterance in conversation:
        # Tolerate a missing, None or non-string speaker label.
        speaker = str(utterance.get("speaker") or "").strip().lower()
        text = utterance.get("text") or ""

        # Substring test so that labels such as "Agent 1" are still treated as
        # the agent, consistent with pattern_detect_profanity.
        if "agent" not in speaker or not isinstance(text, str) or not text:
            continue

        # Check for verification attempts
        if not verified and any(pattern.search(text) for pattern in VERIFICATION_REGEX):
            verification_attempted = True
            # Check if verification was successful (simplified)
            lowered = text.lower()
            if "correct" in lowered or "match" in lowered:
                verified = True

        # Check for sensitive information: first match per pattern.
        found = (pattern.search(text) for pattern in SENSITIVE_REGEX)
        sensitive_info = [match.group() for match in found if match]

        if sensitive_info and not verified:
            # Copy so the caller's utterance dictionaries are not mutated.
            flagged_utterance = utterance.copy()
            flagged_utterance["sensitive_info"] = sensitive_info
            flagged_utterance["verification_status"] = {
                "verified": verified,
                "attempted": verification_attempted
            }
            violations.append(flagged_utterance)

    return violations
127
+