1use regex::RegexBuilder;
3use std::sync::OnceLock;
4
/// Ordered list of `(regex source, replacement text)` pairs.
type PatternTable = Vec<(&'static str, &'static str)>;

/// Lazily-initialised table of code-cleaning rules; filled on first use by
/// `get_cleaning_patterns`.
static CLEANING_PATTERNS: OnceLock<PatternTable> = OnceLock::new();

/// Lazily-initialised table of secret-redaction rules; filled on first use by
/// `get_secret_patterns`.
static SECRET_PATTERNS: OnceLock<PatternTable> = OnceLock::new();
10
11fn get_cleaning_patterns() -> &'static Vec<(&'static str, &'static str)> {
12 CLEANING_PATTERNS.get_or_init(|| {
13 vec![
14 (r"//.*?$", ""), (r"/\*[\s\S]*?\*/", ""), (r"console\.(log|error|warn|info)\([^)]*\);?", ""), (r"^\s*[\r\n]", ""), (r" +$", ""), (r"^\s*import\s+.*?;?\s*$", ""), (r"^\s*\n+", "\n"), ]
22 })
23}
24
25fn get_secret_patterns() -> &'static Vec<(&'static str, &'static str)> {
26 SECRET_PATTERNS.get_or_init(|| {
27 vec![
28 (r#"((?:api|stripe|access|auth|client|secret|private|jwt)[_-]?(?:key|secret|token))\s*=\s*["']([^"']+)["']"#, "$1=[REDACTED]"),
30 (r#"^(API[_-]?KEY|API[_-]?SECRET|ACCESS[_-]?TOKEN|AUTH[_-]?TOKEN|CLIENT[_-]?SECRET|DB[_-]?PASSWORD|DATABASE[_-]?PASSWORD|AWS_ACCESS_KEY_ID|AWS_SECRET_ACCESS_KEY|GOOGLE_API_KEY|AZURE_CLIENT_SECRET|DATABASE_URL|MONGO_URI|MYSQL_URL|JWT[_-]?SECRET|SECRET[_-]?KEY|PRIVATE[_-]?KEY)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s#\n]*)"#, "$1=[REDACTED]"),
32 (r"bearer\s+[a-zA-Z0-9\-._~+\/=]+", "bearer [REDACTED]"),
34 (r"eyJ[A-Za-z0-9_\-=]+\.[A-Za-z0-9_\-=]+\.[A-Za-z0-9_\-=.]+", "[REDACTED_JWT]"),
36 (r"\b[a-f0-9]{40}\b", "[REDACTED_HASH]"),
38 (r"\b[a-f0-9]{64}\b", "[REDACTED_HASH]"),
39 (r#"["']([A-Za-z0-9+/]{40,}={0,2})["']"#, "[REDACTED_BASE64]"),
41 ]
42 })
43}
44
/// Estimates the token count of `text` as its number of whitespace-separated
/// words.
///
/// Leading, trailing and repeated whitespace produce no empty words, so an
/// empty or all-whitespace string yields `0`.
pub fn count_tokens(text: &str) -> usize {
    let words = text.split_whitespace();
    words.fold(0, |total, _word| total + 1)
}
51
52pub fn clean_code(code: &str) -> String {
54 let mut result = code.to_string();
55
56 for (pattern_str, replacement) in get_cleaning_patterns() {
57 if let Ok(re) = RegexBuilder::new(pattern_str)
59 .multi_line(true)
60 .dot_matches_new_line(true)
61 .build()
62 {
63 result = re.replace_all(&result, *replacement).to_string();
64 }
65 }
66
67 result.trim().to_string()
68}
69
70pub fn redact_secrets(code: &str) -> String {
72 let mut result = code.to_string();
73
74 for (pattern_str, replacement) in get_secret_patterns() {
75 if let Ok(re) = RegexBuilder::new(pattern_str)
76 .multi_line(true)
77 .case_insensitive(true)
78 .build()
79 {
80 result = re.replace_all(&result, *replacement).to_string();
81 }
82 }
83
84 result
85}
86
/// Drops every line of `code` that contains the marker text `[REDACTED`.
///
/// The surviving lines are joined with `\n`; original line terminators
/// (including `\r\n`) and any trailing newline are not preserved.
fn remove_redacted_lines(code: &str) -> String {
    let mut kept = String::with_capacity(code.len());
    let mut wrote_line = false;

    for line in code.lines() {
        if line.contains("[REDACTED") {
            continue;
        }
        // Separator goes *between* kept lines, never after the last one.
        if wrote_line {
            kept.push('\n');
        }
        kept.push_str(line);
        wrote_line = true;
    }

    kept
}
95
96pub fn clean_and_redact(code: &str) -> String {
101 let redacted = redact_secrets(code);
102 let without_redacted_lines = remove_redacted_lines(&redacted);
103 let cleaned = clean_code(&without_redacted_lines);
104 cleaned.trim().to_string()
105}
106
#[cfg(test)]
mod tests {
    use super::*;

    /// Word counting, including empty input and mixed whitespace.
    #[test]
    fn test_count_tokens() {
        assert_eq!(count_tokens("hello world"), 2);
        assert_eq!(count_tokens("one two three four"), 4);
        assert_eq!(count_tokens(""), 0);
        assert_eq!(count_tokens("  padded\twith\nmixed  whitespace "), 4);
    }

    /// Line comments are stripped while the code on both lines survives.
    #[test]
    fn test_clean_comments() {
        let code = "let x = 1; // this is a comment\nlet y = 2;";
        let cleaned = clean_code(code);
        assert!(!cleaned.contains("comment"), "Result: {}", cleaned);
        assert!(cleaned.contains("let x"), "Result: {}", cleaned);
        assert!(cleaned.contains("let y"), "Result: {}", cleaned);
    }

    /// The marker must be present AND the secret value must be gone.
    #[test]
    fn test_redact_api_key() {
        let code = r#"const API_KEY="sk-1234567890abcdef""#;
        let redacted = redact_secrets(code);
        assert!(redacted.contains("[REDACTED]"), "Result: {}", redacted);
        assert!(
            !redacted.contains("sk-1234567890abcdef"),
            "Secret value should be removed. Result: {}",
            redacted
        );
    }

    /// The full pipeline removes comments, the secret and the console call.
    #[test]
    fn test_clean_and_redact() {
        let code = r#"
            // API endpoint
            const API_KEY = "secret-key-123";
            console.log("test");
        "#;
        let result = clean_and_redact(code);
        assert!(
            !result.contains("//"),
            "Comments should be removed. Result: {}",
            result
        );
        assert!(
            !result.contains("secret-key-123"),
            "Secret value should be removed. Result: {}",
            result
        );
        assert!(
            !result.contains("console.log"),
            "Console calls should be removed. Result: {}",
            result
        );
    }
}