Skip to main content

toak_rs/
text_chunker.rs

1//! Helpers for slicing strings into token-aware chunks for embeddings and documentation.
2use crate::token_cleaner::count_tokens;
3
/// Configuration for text chunking.
///
/// Both sizes are expressed in tokens, the same unit that `count_tokens`
/// reports and that `chunk_text` compares against.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkerConfig {
    /// Target size for each chunk in tokens
    pub chunk_size: usize,
    /// Number of tokens to overlap between chunks for context preservation
    pub overlap_size: usize,
}
12
13impl Default for ChunkerConfig {
14    fn default() -> Self {
15        Self {
16            chunk_size: 800,
17            overlap_size: 100,
18        }
19    }
20}
21
/// Represents a chunk of text with metadata describing where it came from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk {
    /// The text carried by this chunk.
    pub content: String,
    /// Position in the source text where this chunk starts, as assigned by the
    /// chunker. `chunk_text` records 0-based line numbers here when it splits
    /// text into several chunks.
    pub start_index: usize,
    /// Position in the source text where this chunk ends, as assigned by the
    /// chunker. When `chunk_text` splits text into several chunks this is the
    /// 0-based number of the first line *after* the chunk (exclusive end).
    pub end_index: usize,
    /// 0-based position of this chunk within the sequence produced for one text.
    pub chunk_index: usize,
}
30
31/// Chunks text into overlapping segments based on token count
32pub fn chunk_text(text: &str, config: &ChunkerConfig) -> Vec<TextChunk> {
33    if text.trim().is_empty() {
34        return vec![];
35    }
36
37    // If the entire text fits in one chunk, return it as-is
38    let total_tokens = count_tokens(text);
39    if total_tokens <= config.chunk_size {
40        return vec![TextChunk {
41            content: text.to_string(),
42            start_index: 0,
43            end_index: text.len(),
44            chunk_index: 0,
45        }];
46    }
47
48    let mut chunks = Vec::new();
49    let lines: Vec<&str> = text.lines().collect();
50    let mut current_chunk = String::new();
51    let mut current_tokens = 0;
52    let mut start_line = 0;
53    let mut chunk_index = 0;
54    let mut overlap_buffer = String::new();
55
56    for (line_idx, line) in lines.iter().enumerate() {
57        let line_with_newline = format!("{}\n", line);
58        let line_tokens = count_tokens(&line_with_newline);
59
60        // If a single line is too large, we need to split it by characters
61        if line_tokens > config.chunk_size {
62            // First, finish current chunk if it has content
63            if !current_chunk.is_empty() {
64                chunks.push(TextChunk {
65                    content: current_chunk.clone(),
66                    start_index: start_line,
67                    end_index: line_idx,
68                    chunk_index,
69                });
70                chunk_index += 1;
71            }
72
73            // Split the large line into character-based chunks
74            let char_chunks = split_large_line(line, config);
75            for char_chunk in char_chunks {
76                chunks.push(TextChunk {
77                    content: char_chunk,
78                    start_index: line_idx,
79                    end_index: line_idx + 1,
80                    chunk_index,
81                });
82                chunk_index += 1;
83            }
84
85            // Reset for next chunk
86            current_chunk.clear();
87            current_tokens = 0;
88            start_line = line_idx + 1;
89            overlap_buffer.clear();
90            continue;
91        }
92
93        // Check if adding this line would exceed chunk size
94        if current_tokens + line_tokens > config.chunk_size && !current_chunk.is_empty() {
95            // Save current chunk
96            chunks.push(TextChunk {
97                content: current_chunk.clone(),
98                start_index: start_line,
99                end_index: line_idx,
100                chunk_index,
101            });
102            chunk_index += 1;
103
104            // Start new chunk with overlap from previous chunk
105            current_chunk = overlap_buffer.clone();
106            current_tokens = count_tokens(&current_chunk);
107            start_line = line_idx;
108        }
109
110        // Add line to current chunk
111        current_chunk.push_str(&line_with_newline);
112        current_tokens += line_tokens;
113
114        // Update overlap buffer (keep last N tokens worth of lines)
115        overlap_buffer.push_str(&line_with_newline);
116        let overlap_tokens = count_tokens(&overlap_buffer);
117
118        // Trim overlap buffer if it's too large
119        if overlap_tokens > config.overlap_size {
120            let overlap_lines: Vec<&str> = overlap_buffer.lines().collect();
121            let mut new_overlap = String::new();
122            let mut overlap_tok = 0;
123
124            for ol in overlap_lines.iter().rev() {
125                let ol_with_newline = format!("{}\n", ol);
126                let ol_tokens = count_tokens(&ol_with_newline);
127
128                if overlap_tok + ol_tokens > config.overlap_size {
129                    break;
130                }
131
132                new_overlap = format!("{}{}", ol_with_newline, new_overlap);
133                overlap_tok += ol_tokens;
134            }
135
136            overlap_buffer = new_overlap;
137        }
138    }
139
140    // Add final chunk if there's remaining content
141    if !current_chunk.is_empty() {
142        chunks.push(TextChunk {
143            content: current_chunk,
144            start_index: start_line,
145            end_index: lines.len(),
146            chunk_index,
147        });
148    }
149
150    chunks
151}
152
153/// Splits a very large line into smaller chunks based on character count
154fn split_large_line(line: &str, config: &ChunkerConfig) -> Vec<String> {
155    let mut result = Vec::new();
156    let chars: Vec<char> = line.chars().collect();
157
158    // Estimate characters per chunk (rough approximation: 4 chars per token)
159    let chars_per_chunk = config.chunk_size * 4;
160
161    let mut start = 0;
162    while start < chars.len() {
163        let end = (start + chars_per_chunk).min(chars.len());
164        let chunk: String = chars[start..end].iter().collect();
165
166        // Verify the chunk isn't too large
167        if count_tokens(&chunk) <= config.chunk_size || result.is_empty() {
168            result.push(chunk);
169            start = end;
170        } else {
171            // If still too large, try with fewer characters
172            let reduced_end = start + (chars_per_chunk / 2).max(1);
173            let chunk: String = chars[start..reduced_end].iter().collect();
174            result.push(chunk);
175            start = reduced_end;
176        }
177    }
178
179    result
180}
181
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input must not produce any chunks.
    #[test]
    fn test_empty_text() {
        let chunks = chunk_text("", &ChunkerConfig::default());
        assert!(chunks.is_empty());
    }

    /// Text well below the chunk size comes back as one chunk, unchanged.
    #[test]
    fn test_small_text() {
        let text = "Hello, world!";
        let chunks = chunk_text(text, &ChunkerConfig::default());

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    /// Text larger than the chunk size is split, and chunks are numbered 0, 1, 2, ...
    #[test]
    fn test_chunking_with_overlap() {
        let config = ChunkerConfig {
            chunk_size: 50,
            overlap_size: 10,
        };

        // One hundred short lines separated by newlines, no trailing newline.
        let mut text = String::new();
        for i in 0..100 {
            if i > 0 {
                text.push('\n');
            }
            text.push_str(&format!("Line {}", i));
        }

        let chunks = chunk_text(&text, &config);

        // Should create multiple chunks
        assert!(chunks.len() > 1);

        // Verify chunk indices are sequential
        let seen: Vec<usize> = chunks.iter().map(|chunk| chunk.chunk_index).collect();
        let expected: Vec<usize> = (0..chunks.len()).collect();
        assert_eq!(seen, expected);
    }
}