1use crate::token_cleaner::count_tokens;
3
/// Tuning parameters for [`chunk_text`].
///
/// Both sizes are expressed in tokens as measured by `count_tokens`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkerConfig {
    /// Token budget for a single chunk. Text whose total token count does not
    /// exceed this value is returned as one chunk; longer text is split.
    pub chunk_size: usize,
    /// Token budget for the trailing lines of a chunk that are repeated at the
    /// start of the following chunk.
    pub overlap_size: usize,
}
12
13impl Default for ChunkerConfig {
14 fn default() -> Self {
15 Self {
16 chunk_size: 800,
17 overlap_size: 100,
18 }
19 }
20}
21
/// One piece of text produced by [`chunk_text`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk {
    /// The chunk's text.
    pub content: String,
    /// Start position of the chunk in the source text.
    ///
    /// `chunk_text` reports a line index here when it splits the text, and `0`
    /// when the whole text is returned as a single chunk.
    pub start_index: usize,
    /// End position of the chunk in the source text.
    ///
    /// `chunk_text` reports an exclusive line index here when it splits the
    /// text, but the byte length of the text (`text.len()`) when the whole text
    /// is returned as a single chunk.
    // NOTE(review): the two paths use different units; confirm with callers
    // before relying on either interpretation.
    pub end_index: usize,
    /// Zero-based position of this chunk in the returned sequence.
    pub chunk_index: usize,
}
30
31pub fn chunk_text(text: &str, config: &ChunkerConfig) -> Vec<TextChunk> {
33 if text.trim().is_empty() {
34 return vec![];
35 }
36
37 let total_tokens = count_tokens(text);
39 if total_tokens <= config.chunk_size {
40 return vec![TextChunk {
41 content: text.to_string(),
42 start_index: 0,
43 end_index: text.len(),
44 chunk_index: 0,
45 }];
46 }
47
48 let mut chunks = Vec::new();
49 let lines: Vec<&str> = text.lines().collect();
50 let mut current_chunk = String::new();
51 let mut current_tokens = 0;
52 let mut start_line = 0;
53 let mut chunk_index = 0;
54 let mut overlap_buffer = String::new();
55
56 for (line_idx, line) in lines.iter().enumerate() {
57 let line_with_newline = format!("{}\n", line);
58 let line_tokens = count_tokens(&line_with_newline);
59
60 if line_tokens > config.chunk_size {
62 if !current_chunk.is_empty() {
64 chunks.push(TextChunk {
65 content: current_chunk.clone(),
66 start_index: start_line,
67 end_index: line_idx,
68 chunk_index,
69 });
70 chunk_index += 1;
71 }
72
73 let char_chunks = split_large_line(line, config);
75 for char_chunk in char_chunks {
76 chunks.push(TextChunk {
77 content: char_chunk,
78 start_index: line_idx,
79 end_index: line_idx + 1,
80 chunk_index,
81 });
82 chunk_index += 1;
83 }
84
85 current_chunk.clear();
87 current_tokens = 0;
88 start_line = line_idx + 1;
89 overlap_buffer.clear();
90 continue;
91 }
92
93 if current_tokens + line_tokens > config.chunk_size && !current_chunk.is_empty() {
95 chunks.push(TextChunk {
97 content: current_chunk.clone(),
98 start_index: start_line,
99 end_index: line_idx,
100 chunk_index,
101 });
102 chunk_index += 1;
103
104 current_chunk = overlap_buffer.clone();
106 current_tokens = count_tokens(¤t_chunk);
107 start_line = line_idx;
108 }
109
110 current_chunk.push_str(&line_with_newline);
112 current_tokens += line_tokens;
113
114 overlap_buffer.push_str(&line_with_newline);
116 let overlap_tokens = count_tokens(&overlap_buffer);
117
118 if overlap_tokens > config.overlap_size {
120 let overlap_lines: Vec<&str> = overlap_buffer.lines().collect();
121 let mut new_overlap = String::new();
122 let mut overlap_tok = 0;
123
124 for ol in overlap_lines.iter().rev() {
125 let ol_with_newline = format!("{}\n", ol);
126 let ol_tokens = count_tokens(&ol_with_newline);
127
128 if overlap_tok + ol_tokens > config.overlap_size {
129 break;
130 }
131
132 new_overlap = format!("{}{}", ol_with_newline, new_overlap);
133 overlap_tok += ol_tokens;
134 }
135
136 overlap_buffer = new_overlap;
137 }
138 }
139
140 if !current_chunk.is_empty() {
142 chunks.push(TextChunk {
143 content: current_chunk,
144 start_index: start_line,
145 end_index: lines.len(),
146 chunk_index,
147 });
148 }
149
150 chunks
151}
152
153fn split_large_line(line: &str, config: &ChunkerConfig) -> Vec<String> {
155 let mut result = Vec::new();
156 let chars: Vec<char> = line.chars().collect();
157
158 let chars_per_chunk = config.chunk_size * 4;
160
161 let mut start = 0;
162 while start < chars.len() {
163 let end = (start + chars_per_chunk).min(chars.len());
164 let chunk: String = chars[start..end].iter().collect();
165
166 if count_tokens(&chunk) <= config.chunk_size || result.is_empty() {
168 result.push(chunk);
169 start = end;
170 } else {
171 let reduced_end = start + (chars_per_chunk / 2).max(1);
173 let chunk: String = chars[start..reduced_end].iter().collect();
174 result.push(chunk);
175 start = reduced_end;
176 }
177 }
178
179 result
180}
181
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty string produces no chunks at all.
    #[test]
    fn test_empty_text() {
        let chunks = chunk_text("", &ChunkerConfig::default());
        assert!(chunks.is_empty());
    }

    /// Text well under the default chunk size comes back as one verbatim chunk.
    #[test]
    fn test_small_text() {
        let text = "Hello, world!";
        let chunks = chunk_text(text, &ChunkerConfig::default());
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    /// A long multi-line input is split into several sequentially numbered chunks.
    #[test]
    fn test_chunking_with_overlap() {
        let config = ChunkerConfig {
            chunk_size: 50,
            overlap_size: 10,
        };

        // Build "Line 0\nLine 1\n...\nLine 99" without a trailing newline.
        let mut text = String::new();
        for i in 0..100 {
            if i > 0 {
                text.push('\n');
            }
            text.push_str(&format!("Line {}", i));
        }

        let chunks = chunk_text(&text, &config);
        assert!(chunks.len() > 1);

        for (expected_index, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, expected_index);
        }
    }
}