1use crate::token_cleaner::{clean_and_redact, count_tokens};
4use anyhow::{anyhow, Result};
5use regex::Regex;
6use std::collections::HashSet;
7use std::path::{Component, Path, PathBuf};
8use std::process::Command;
9use tokio::fs;
10
/// Dotted, lowercase file extensions whose content is extracted through OCR instead of being
/// read as UTF-8 text. Only compiled for macOS builds with the `embeddings` feature enabled.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
const OCR_FILE_TYPES: &[&str] = &[
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".bmp",
    ".webp",
    ".tiff",
    ".pdf",
];
17
/// Dotted file extensions skipped by default when OCR support is compiled in
/// (macOS with the `embeddings` feature). Image and PDF extensions are intentionally absent
/// here because they are listed in `OCR_FILE_TYPES` instead.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
const DEFAULT_FILE_TYPE_EXCLUSIONS: &[&str] = &[
    // Vector images, icons and fonts.
    ".svg", ".ico", ".ttf", ".woff", ".woff2", ".eot", ".otf",
    // Lock files.
    ".lock", ".lockb",
    // Native binaries and opaque data.
    ".exe", ".dll", ".so", ".dylib", ".bin", ".dat",
    // Compiled bytecode and packaged classes.
    ".pyc", ".pyo", ".class", ".jar",
    // Archives.
    ".zip", ".tar", ".gz", ".rar", ".7z",
    // Audio and video.
    ".mp3", ".mp4", ".avi", ".mov", ".wav",
    // Databases.
    ".db", ".sqlite", ".sqlite3",
];
25
/// Dotted file extensions skipped by default when OCR support is not compiled in
/// (any non-macOS target, or the `embeddings` feature disabled). Image and PDF extensions
/// are part of this list, so those files are excluded as well.
#[cfg(any(not(target_os = "macos"), not(feature = "embeddings")))]
const DEFAULT_FILE_TYPE_EXCLUSIONS: &[&str] = &[
    // Raster images and PDFs.
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf",
    // Vector images, icons and fonts.
    ".svg", ".ico", ".ttf", ".woff", ".woff2", ".eot", ".otf",
    // Lock files.
    ".lock", ".lockb",
    // Native binaries and opaque data.
    ".exe", ".dll", ".so", ".dylib", ".bin", ".dat",
    // Compiled bytecode and packaged classes.
    ".pyc", ".pyo", ".class", ".jar",
    // Archives.
    ".zip", ".tar", ".gz", ".rar", ".7z",
    // Audio and video.
    ".mp3", ".mp4", ".avi", ".mov", ".wav",
    // Databases.
    ".db", ".sqlite", ".sqlite3",
];
33
/// Glob patterns (relative to the repository root) that are excluded by default.
/// They become the initial `file_exclusions` of `MarkdownGeneratorOptions::default()`.
const DEFAULT_FILE_EXCLUSIONS: &[&str] = &[
    // Tool and editor configuration.
    "**/.*rc",
    "**/.*rc.{js,json,yaml,yml}",
    "**/*.config.{js,ts}",
    "**/tsconfig.json", "**/tsconfig*.json",
    "**/jsconfig.json", "**/jsconfig*.json",
    "**/package-lock.json",
    "**/.prettierignore",
    "**/.dockerignore",
    // Environment files and secrets.
    "**/.env*",
    "**/*.vars",
    "**/secrets.*",
    // Version-control metadata.
    "**/.git*", "**/.hg*", "**/.svn*", "**/CVS",
    // CI configuration.
    "**/.github/",
    "**/.gitlab-ci.yml",
    "**/azure-pipelines.yml",
    "**/jenkins*",
    // Dependencies, virtual environments and build output.
    "**/node_modules/",
    "**/target/",
    "**/__pycache__/",
    "**/venv/", "**/.venv/", "**/env/",
    "**/build/", "**/dist/", "**/out/", "**/bin/", "**/obj/",
    // Project documentation.
    "**/README*", "**/CHANGELOG*", "**/CONTRIBUTING*", "**/LICENSE*",
    "**/docs/", "**/documentation/",
    // IDE state.
    "**/.idea/", "**/.vscode/", "**/.eclipse/", "**/.settings/", "**/.zed/", "**/.cursor/",
    "**/.project", "**/.classpath", "**/.factorypath",
    // Tests, fixtures and coverage.
    "**/test{s,}/",
    "**/spec/",
    "**/fixtures/",
    "**/testdata/",
    "**/__tests__/",
    "**/*.{test,spec}.*",
    "**/coverage/",
    "**/jest.config.*",
    // Logs and scratch directories.
    "**/logs/", "**/tmp/", "**/temp/",
    "**/*.log",
];
96
/// Caller-supplied configuration for a `MarkdownGenerator` run.
#[derive(Debug, Clone)]
pub struct MarkdownGeneratorOptions {
    /// Repository directory: `git ls-files` is executed here and tracked paths are joined
    /// onto it before being read.
    pub dir: PathBuf,
    /// Path the generated markdown document is written to.
    pub output_file_path: PathBuf,
    /// Dotted file extensions (for example `".png"`) whose files are skipped.
    pub file_type_exclusions: HashSet<String>,
    /// Glob patterns for tracked paths that are skipped.
    pub file_exclusions: Vec<String>,
    /// When `true`, progress information is printed while generating.
    pub verbose: bool,
}
105
106impl Default for MarkdownGeneratorOptions {
107 fn default() -> Self {
108 Self {
109 dir: PathBuf::from("."),
110 output_file_path: PathBuf::from("prompt.md"),
111 file_type_exclusions: DEFAULT_FILE_TYPE_EXCLUSIONS
112 .iter()
113 .map(|s| s.to_string())
114 .collect(),
115 file_exclusions: DEFAULT_FILE_EXCLUSIONS
116 .iter()
117 .map(|s| s.to_string())
118 .collect(),
119 verbose: true,
120 }
121 }
122}
123
124pub struct MarkdownGenerator {
126 options: MarkdownGeneratorOptions,
127 file_exclusions: Vec<String>,
128 initialized: bool,
129 behavior: MarkdownGeneratorBehavior,
130}
131
/// Side-effect switches selected by the constructor that created the generator.
#[derive(Clone, Copy, Debug)]
struct MarkdownGeneratorBehavior {
    /// Append the `todo` file to the document and treat `todo` as a generated file.
    include_todo: bool,
    /// Create an empty `todo` file when none exists.
    create_todo_file: bool,
    /// Add the generated files to the repository's `.gitignore`.
    update_gitignore: bool,
    /// Treat `embeddings.json` as a generated file.
    include_embeddings_artifact: bool,
}
139
140impl MarkdownGeneratorBehavior {
141 fn programmatic() -> Self {
142 Self {
143 include_todo: true,
144 create_todo_file: false,
145 update_gitignore: false,
146 include_embeddings_artifact: false,
147 }
148 }
149
150 fn cli(generate_embeddings: bool) -> Self {
151 Self {
152 include_todo: true,
153 create_todo_file: true,
154 update_gitignore: true,
155 include_embeddings_artifact: generate_embeddings,
156 }
157 }
158}
159
160impl MarkdownGenerator {
161 pub fn new(options: MarkdownGeneratorOptions) -> Self {
166 Self::with_behavior(options, MarkdownGeneratorBehavior::programmatic())
167 }
168
169 pub fn new_for_cli(options: MarkdownGeneratorOptions, generate_embeddings: bool) -> Self {
175 Self::with_behavior(options, MarkdownGeneratorBehavior::cli(generate_embeddings))
176 }
177
178 fn with_behavior(options: MarkdownGeneratorOptions, behavior: MarkdownGeneratorBehavior) -> Self {
179 let mut file_exclusions = options.file_exclusions.clone();
180 Self::add_generated_file_exclusions(&options, behavior, &mut file_exclusions);
181
182 Self {
183 file_exclusions,
184 options,
185 initialized: false,
186 behavior,
187 }
188 }
189
190 fn add_generated_file_exclusions(
191 options: &MarkdownGeneratorOptions,
192 behavior: MarkdownGeneratorBehavior,
193 file_exclusions: &mut Vec<String>,
194 ) {
195 if let Some(output_path) =
196 Self::repo_relative_path(&options.dir, &options.output_file_path)
197 {
198 Self::push_unique(file_exclusions, output_path);
199 }
200
201 if behavior.include_todo {
202 Self::push_unique(file_exclusions, String::from("todo"));
203 }
204
205 if behavior.include_embeddings_artifact {
206 Self::push_unique(file_exclusions, String::from("embeddings.json"));
207 }
208 }
209
/// Appends `value` to `values` unless it is empty or already present.
fn push_unique(values: &mut Vec<String>, value: String) {
    if value.is_empty() || values.contains(&value) {
        return;
    }
    values.push(value);
}
215
/// Expresses `path` relative to `repo_dir` as a forward-slash separated string.
///
/// A relative `path` is taken as already relative to the repository. An absolute `path` is
/// stripped of `repo_dir` directly, or failing that, of the absolute (and, when possible,
/// canonicalised) form of `repo_dir`.
///
/// Returns `None` when the path lies outside the repository, contains `..`, root or prefix
/// components, or has no components left.
fn repo_relative_path(repo_dir: &Path, path: &Path) -> Option<String> {
    let inside_repo: &Path = if !path.is_absolute() {
        path
    } else {
        match path.strip_prefix(repo_dir) {
            Ok(stripped) => stripped,
            Err(_) => {
                let absolute_repo = if repo_dir.is_absolute() {
                    repo_dir.to_path_buf()
                } else {
                    std::env::current_dir().ok()?.join(repo_dir)
                };
                // Canonicalisation is best effort; fall back to the plain absolute path.
                let resolved_repo = absolute_repo.canonicalize().unwrap_or(absolute_repo);
                path.strip_prefix(resolved_repo).ok()?
            }
        }
    };

    // `.` components are dropped; anything other than a plain name rejects the whole path.
    let segments = inside_repo
        .components()
        .filter(|component| !matches!(component, Component::CurDir))
        .map(|component| match component {
            Component::Normal(name) => Some(name.to_string_lossy().into_owned()),
            _ => None,
        })
        .collect::<Option<Vec<String>>>()?;

    match segments.is_empty() {
        true => None,
        false => Some(segments.join("/")),
    }
}
248
249 async fn load_nested_ignore_files(&mut self) -> Result<()> {
251 if self.options.verbose {
252 println!("Loading ignore patterns...");
253 }
254
255 let mut ignore_files = Vec::new();
257 self.find_ignore_files(&self.options.dir, &mut ignore_files)?;
258
259 if self.options.verbose {
260 println!("Found {} ignore files", ignore_files.len());
261 }
262
263 for ignore_file in ignore_files {
265 if let Ok(content) = fs::read_to_string(&ignore_file).await {
266 let patterns: Vec<String> = content
267 .lines()
268 .map(|line| line.trim())
269 .filter(|line| !line.is_empty() && !line.starts_with('#'))
270 .map(|s| s.to_string())
271 .collect();
272
273 if let Ok(ignore_dir) = ignore_file
275 .parent()
276 .unwrap_or_else(|| Path::new("."))
277 .to_path_buf()
278 .strip_prefix(&self.options.dir)
279 {
280 for pattern in patterns {
281 let relative_pattern = if ignore_dir.as_os_str().is_empty()
282 || pattern.starts_with('/')
283 || pattern.starts_with("**")
284 {
285 pattern
286 } else {
287 format!("{}/{}", ignore_dir.display(), pattern)
288 };
289 self.file_exclusions.push(relative_pattern);
290 }
291 }
292 }
293 }
294
295 self.file_exclusions.sort();
297 self.file_exclusions.dedup();
298
299 if self.options.verbose {
300 println!("Total exclusion patterns: {}", self.file_exclusions.len());
301 }
302
303 Ok(())
304 }
305
306 fn find_ignore_files(&self, dir: &Path, results: &mut Vec<PathBuf>) -> Result<()> {
307 use walkdir::WalkDir;
308
309 for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
310 if entry.file_name() == ".aiignore" {
311 results.push(entry.path().to_path_buf());
312 }
313 }
314 Ok(())
315 }
316
317 async fn initialize(&mut self) -> Result<()> {
319 if !self.initialized {
320 self.load_nested_ignore_files().await?;
321 self.initialized = true;
322 }
323 Ok(())
324 }
325
326 async fn get_tracked_files(&mut self) -> Result<Vec<String>> {
328 self.initialize().await?;
329
330 let output = Command::new("git")
332 .arg("ls-files")
333 .current_dir(&self.options.dir)
334 .output()
335 .map_err(|e| anyhow!("Failed to execute git ls-files: {}", e))?;
336
337 if !output.status.success() {
338 return Err(anyhow!("git ls-files failed"));
339 }
340
341 let output_str = String::from_utf8(output.stdout)
342 .map_err(|e| anyhow!("Failed to decode git output: {}", e))?;
343
344 let tracked_files: Vec<String> = output_str
345 .lines()
346 .filter(|line| !line.trim().is_empty())
347 .map(|s| s.to_string())
348 .collect();
349
350 if self.options.verbose {
351 println!("Total tracked files: {}", tracked_files.len());
352 }
353
354 let total_files = tracked_files.len();
355
356 let filtered_files = tracked_files
358 .into_iter()
359 .filter(|file| {
360 let path = Path::new(file);
361 let ext = path
362 .extension()
363 .and_then(|e| e.to_str())
364 .map(|e| format!(".{}", e))
365 .unwrap_or_default();
366
367 if self.options.file_type_exclusions.contains(&ext) {
369 return false;
370 }
371
372 !self.matches_exclusion_patterns(file)
374 })
375 .collect::<Vec<_>>();
376
377 if self.options.verbose {
378 println!("Excluded files: {}", total_files - filtered_files.len());
379 println!(
380 "Files to process after exclusions: {}",
381 filtered_files.len()
382 );
383 }
384
385 Ok(filtered_files)
386 }
387
388 fn matches_exclusion_patterns(&self, file: &str) -> bool {
390 for pattern in &self.file_exclusions {
391 if self.glob_match(pattern, file) {
392 return true;
393 }
394 }
395 false
396 }
397
398 fn glob_match(&self, pattern: &str, path: &str) -> bool {
400 let pattern = pattern
401 .replace("**", ".*")
402 .replace("*", "[^/]*")
403 .replace("?", "[^/]");
404 let pattern = format!("^{}$", pattern);
405
406 if let Ok(re) = Regex::new(&pattern) {
407 re.is_match(path)
408 } else {
409 false
410 }
411 }
412
/// Returns `true` when the dotted extension `ext` is listed in `OCR_FILE_TYPES`.
/// The comparison is exact, so callers pass a lowercased extension.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
fn is_ocr_file(ext: &str) -> bool {
    OCR_FILE_TYPES.iter().any(|candidate| *candidate == ext)
}
418
419 async fn read_file_content(&self, file_path: &Path) -> Result<String> {
421 #[cfg(all(target_os = "macos", feature = "embeddings"))]
422 {
423 let ext = file_path
424 .extension()
425 .and_then(|e| e.to_str())
426 .map(|e| format!(".{}", e.to_lowercase()))
427 .unwrap_or_default();
428
429 if Self::is_ocr_file(&ext) {
430 return self.read_file_content_ocr(file_path).await;
431 }
432 }
433
434 let content = fs::read_to_string(file_path).await?;
435 let cleaned = clean_and_redact(&content);
436
437 if self.options.verbose && !cleaned.is_empty() {
438 let token_count = count_tokens(&cleaned);
439 println!("{}: Tokens[{}]", file_path.display(), token_count);
440 }
441
442 Ok(cleaned.trim_end().to_string())
443 }
444
/// Extracts the text of `file_path` with the Apple OCR engine and returns it without
/// trailing whitespace. In verbose mode the token count of non-empty text is printed.
///
/// # Errors
///
/// Fails when the OCR engine reports an error; the file path is part of the message.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
async fn read_file_content_ocr(&self, file_path: &Path) -> Result<String> {
    use toak_ocr::{AppleOcrEngine, OcrEngine, OcrInput};

    let ocr_engine = AppleOcrEngine::new();
    let ocr_input = OcrInput::FilePath(file_path.to_path_buf());
    let recognition = ocr_engine.recognize(&ocr_input).await;
    let output =
        recognition.map_err(|e| anyhow!("OCR failed for {}: {}", file_path.display(), e))?;

    let should_report = self.options.verbose && !output.text.is_empty();
    if should_report {
        let tokens = count_tokens(&output.text);
        println!("{}: Tokens[{}] (OCR)", file_path.display(), tokens);
    }

    Ok(output.text.trim_end().to_string())
}
464
465 async fn generate_markdown(&mut self) -> Result<String> {
467 let tracked_files = self.get_tracked_files().await?;
468
469 if self.options.verbose {
470 println!("Generating markdown for {} files", tracked_files.len());
471 }
472
473 let mut markdown = String::from("# Project Files\n\n");
474
475 for file in tracked_files {
476 let absolute_path = self.options.dir.join(&file);
477 match self.read_file_content(&absolute_path).await {
478 Ok(content) => {
479 if !content.trim().is_empty() {
480 markdown.push_str(&format!("## {}\n~~~\n{}\n~~~\n\n", file, content.trim()));
481 } else if self.options.verbose {
482 println!("Skipping {} as it has no content after cleaning.", file);
483 }
484 }
485 Err(e) => {
486 if self.options.verbose {
487 eprintln!("Error reading file {}: {}", file, e);
488 }
489 }
490 }
491 }
492
493 Ok(markdown)
494 }
495
496 async fn get_todo(&self) -> Result<Option<String>> {
498 let todo_path = self.options.dir.join("todo");
499
500 if self.options.verbose {
501 println!("Reading todo file");
502 }
503
504 match fs::read_to_string(&todo_path).await {
505 Ok(content) => Ok(Some(content)),
506 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
507 if !self.behavior.create_todo_file {
508 return Ok(None);
509 }
510
511 if self.options.verbose {
512 println!("File not found, creating a new 'todo' file.");
513 }
514 fs::write(&todo_path, "").await?;
515 Ok(Some(String::new()))
516 }
517 Err(e) => Err(anyhow!("Error reading todo file: {}", e)),
518 }
519 }
520
521 fn gitignore_entries(&self) -> Vec<String> {
522 let mut entries = Vec::new();
523
524 if let Some(output_path) =
525 Self::repo_relative_path(&self.options.dir, &self.options.output_file_path)
526 {
527 Self::push_unique(&mut entries, output_path);
528 }
529
530 if self.behavior.include_todo {
531 Self::push_unique(&mut entries, String::from("todo"));
532 }
533
534 if self.behavior.include_embeddings_artifact {
535 Self::push_unique(&mut entries, String::from("embeddings.json"));
536 }
537
538 entries
539 }
540
541 async fn update_gitignore(&self) -> Result<()> {
543 let entries = self.gitignore_entries();
544 if entries.is_empty() {
545 return Ok(());
546 }
547
548 let gitignore_path = self.options.dir.join(".gitignore");
549
550 let content = match fs::read_to_string(&gitignore_path).await {
551 Ok(c) => c,
552 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
553 if self.options.verbose {
554 println!("File not found, creating a '.gitignore' file.");
555 }
556 String::new()
557 }
558 Err(e) => return Err(anyhow!("Error reading .gitignore: {}", e)),
559 };
560
561 let lines: Vec<&str> = content.lines().map(|l| l.trim()).collect();
562 let missing_entries = entries
563 .into_iter()
564 .filter(|entry| !lines.contains(&entry.as_str()))
565 .collect::<Vec<_>>();
566
567 if !missing_entries.is_empty() {
568 if self.options.verbose {
569 println!("Updating .gitignore with generated files");
570 }
571
572 let mut new_content = content;
573 if !new_content.is_empty() && !new_content.ends_with('\n') {
574 new_content.push('\n');
575 }
576
577 for entry in missing_entries {
578 new_content.push_str(&entry);
579 new_content.push('\n');
580 }
581
582 fs::write(&gitignore_path, new_content).await?;
583 }
584
585 Ok(())
586 }
587
588 pub async fn create_markdown_document(&mut self) -> Result<MarkdownResult> {
590 let code_markdown = self.generate_markdown().await?;
591 let markdown = if self.behavior.include_todo {
592 match self.get_todo().await? {
593 Some(todos) => format!("{}\n---\n\n{}\n", code_markdown, todos),
594 None => code_markdown,
595 }
596 } else {
597 code_markdown
598 };
599
600 if self.behavior.update_gitignore {
601 self.update_gitignore().await?;
602 }
603
604 let token_count = count_tokens(&markdown);
605
606 if self.options.verbose {
607 println!(
608 "Markdown document created at {}",
609 self.options.output_file_path.display()
610 );
611 println!("{{ \"total_tokens\": {} }}", token_count);
612 }
613
614 fs::write(&self.options.output_file_path, &markdown).await?;
615
616 Ok(MarkdownResult {
617 success: true,
618 token_count: Some(token_count),
619 error: None,
620 })
621 }
622}
623
/// Outcome of `MarkdownGenerator::create_markdown_document`.
#[derive(Clone, Debug)]
// Not every field is read inside this crate, hence the lint exemption.
#[allow(dead_code)]
pub struct MarkdownResult {
    /// `true` when the document was generated and written.
    pub success: bool,
    /// Token count of the generated document, when one was produced.
    pub token_count: Option<usize>,
    /// Error description for a failed run.
    pub error: Option<String>,
}