Skip to main content

toak_rs/
markdown_generator.rs

1//! Utilities that turn a repository into a human readable markdown file, handling ignore files
2//! and ensuring the generated artifacts are tracked in `.gitignore`.
3use crate::token_cleaner::{clean_and_redact, count_tokens};
4use anyhow::{anyhow, Result};
5use regex::Regex;
6use std::collections::HashSet;
7use std::path::{Component, Path, PathBuf};
8use std::process::Command;
9use tokio::fs;
10
/// File extensions (lowercase, with a leading dot) that are read via OCR instead of as text.
///
/// Only compiled on macOS builds with the `embeddings` feature enabled.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
const OCR_FILE_TYPES: &[&str] = &[
  ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf",
];
17
/// Default file type exclusions (by extension) for macOS builds with the `embeddings` feature.
///
/// Image and PDF extensions are absent here because that configuration lists them in
/// `OCR_FILE_TYPES` instead.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
const DEFAULT_FILE_TYPE_EXCLUSIONS: &[&str] = &[
  ".svg", ".ico", ".ttf", ".woff", ".woff2", ".eot", ".otf", ".lock", ".lockb", ".exe", ".dll",
  ".so", ".dylib", ".bin", ".dat", ".pyc", ".pyo", ".class", ".jar", ".zip", ".tar", ".gz",
  ".rar", ".7z", ".mp3", ".mp4", ".avi", ".mov", ".wav", ".db", ".sqlite", ".sqlite3",
];

/// Default file type exclusions (by extension) for every other build configuration.
///
/// Same list as the macOS/`embeddings` variant, plus the image and PDF extensions.
#[cfg(any(not(target_os = "macos"), not(feature = "embeddings")))]
const DEFAULT_FILE_TYPE_EXCLUSIONS: &[&str] = &[
  ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf", ".svg", ".ico", ".ttf",
  ".woff", ".woff2", ".eot", ".otf", ".lock", ".lockb", ".exe", ".dll", ".so", ".dylib", ".bin",
  ".dat", ".pyc", ".pyo", ".class", ".jar", ".zip", ".tar", ".gz", ".rar", ".7z", ".mp3", ".mp4",
  ".avi", ".mov", ".wav", ".db", ".sqlite", ".sqlite3",
];
33
/// Default file pattern exclusions.
///
/// Glob-style path patterns used as the default value of
/// `MarkdownGeneratorOptions::file_exclusions`. They are grouped roughly as: tool
/// configuration, secrets and environment files, version-control and CI metadata, dependency
/// and build output, project documentation, editor/IDE settings, tests, and logs/temp files.
const DEFAULT_FILE_EXCLUSIONS: &[&str] = &[
  "**/.*rc",
  "**/.*rc.{js,json,yaml,yml}",
  "**/*.config.{js,ts}",
  "**/tsconfig.json",
  "**/tsconfig*.json",
  "**/jsconfig.json",
  "**/jsconfig*.json",
  "**/package-lock.json",
  "**/.prettierignore",
  "**/.dockerignore",
  "**/.env*",
  "**/*.vars",
  "**/secrets.*",
  "**/.git*",
  "**/.hg*",
  "**/.svn*",
  "**/CVS",
  "**/.github/",
  "**/.gitlab-ci.yml",
  "**/azure-pipelines.yml",
  "**/jenkins*",
  "**/node_modules/",
  "**/target/",
  "**/__pycache__/",
  "**/venv/",
  "**/.venv/",
  "**/env/",
  "**/build/",
  "**/dist/",
  "**/out/",
  "**/bin/",
  "**/obj/",
  "**/README*",
  "**/CHANGELOG*",
  "**/CONTRIBUTING*",
  "**/LICENSE*",
  "**/docs/",
  "**/documentation/",
  "**/.idea/",
  "**/.vscode/",
  "**/.eclipse/",
  "**/.settings/",
  "**/.zed/",
  "**/.cursor/",
  "**/.project",
  "**/.classpath",
  "**/.factorypath",
  "**/test{s,}/",
  "**/spec/",
  "**/fixtures/",
  "**/testdata/",
  "**/__tests__/",
  "**/*.{test,spec}.*",
  "**/coverage/",
  "**/jest.config.*",
  "**/logs/",
  "**/tmp/",
  "**/temp/",
  "**/*.log",
];
96
/// Configuration that controls how markdown is generated.
///
/// Derives `Debug` and `Clone` so callers can log a configuration or reuse it across
/// several generator instances.
#[derive(Debug, Clone)]
pub struct MarkdownGeneratorOptions {
  /// Repository directory: `git ls-files` runs here and tracked paths are joined onto it.
  pub dir: PathBuf,
  /// Path the generated markdown document is written to.
  pub output_file_path: PathBuf,
  /// File extensions, including the leading dot (e.g. `.png`), whose files are skipped.
  pub file_type_exclusions: HashSet<String>,
  /// Glob-style path patterns whose matching files are skipped.
  pub file_exclusions: Vec<String>,
  /// When `true`, progress and per-file token counts are printed.
  pub verbose: bool,
}
105
106impl Default for MarkdownGeneratorOptions {
107  fn default() -> Self {
108    Self {
109      dir: PathBuf::from("."),
110      output_file_path: PathBuf::from("prompt.md"),
111      file_type_exclusions: DEFAULT_FILE_TYPE_EXCLUSIONS
112        .iter()
113        .map(|s| s.to_string())
114        .collect(),
115      file_exclusions: DEFAULT_FILE_EXCLUSIONS
116        .iter()
117        .map(|s| s.to_string())
118        .collect(),
119      verbose: true,
120    }
121  }
122}
123
/// Drives the markdown generation run by walking tracked files, cleaning artifacts, and aggregating text.
pub struct MarkdownGenerator {
  /// Caller-supplied configuration.
  options: MarkdownGeneratorOptions,
  /// Effective exclusion patterns: the configured ones, plus the generated artifacts added at
  /// construction, plus patterns loaded from `.aiignore` files during initialization.
  file_exclusions: Vec<String>,
  /// Set once the `.aiignore` files have been loaded so later calls skip that step.
  initialized: bool,
  /// Side-effect switches selected by the constructor (`new` vs `new_for_cli`).
  behavior: MarkdownGeneratorBehavior,
}
131
/// Switches that decide which project-management side effects a generator performs.
#[derive(Debug, Clone, Copy)]
struct MarkdownGeneratorBehavior {
  /// Treat the `todo` file as a managed artifact: exclude it from the file listing and
  /// append its content to the generated document.
  include_todo: bool,
  /// Create an empty `todo` file when none exists.
  create_todo_file: bool,
  /// Append missing generated artifacts to `.gitignore`.
  update_gitignore: bool,
  /// Treat `embeddings.json` as a managed artifact.
  include_embeddings_artifact: bool,
}

impl MarkdownGeneratorBehavior {
  /// Preset for library use: reads `todo` if present but never creates files or edits
  /// `.gitignore`.
  fn programmatic() -> Self {
    Self {
      include_todo: true,
      create_todo_file: false,
      update_gitignore: false,
      include_embeddings_artifact: false,
    }
  }

  /// Preset for the CLI: enables the file-creating side effects on top of the programmatic
  /// preset; `generate_embeddings` controls whether `embeddings.json` is managed.
  fn cli(generate_embeddings: bool) -> Self {
    Self {
      create_todo_file: true,
      update_gitignore: true,
      include_embeddings_artifact: generate_embeddings,
      ..Self::programmatic()
    }
  }
}
159
160impl MarkdownGenerator {
161  /// Creates a generator for library/programmatic use.
162  ///
163  /// This mode avoids project-management side effects: it does not create a
164  /// `todo` file and does not modify `.gitignore`.
165  pub fn new(options: MarkdownGeneratorOptions) -> Self {
166    Self::with_behavior(options, MarkdownGeneratorBehavior::programmatic())
167  }
168
169  /// Creates a generator configured for the CLI.
170  ///
171  /// CLI mode preserves Node CLI parity by including/creating `todo` and
172  /// updating `.gitignore`. Embeddings artifacts are only managed when
173  /// `generate_embeddings` is true.
174  pub fn new_for_cli(options: MarkdownGeneratorOptions, generate_embeddings: bool) -> Self {
175    Self::with_behavior(options, MarkdownGeneratorBehavior::cli(generate_embeddings))
176  }
177
178  fn with_behavior(options: MarkdownGeneratorOptions, behavior: MarkdownGeneratorBehavior) -> Self {
179    let mut file_exclusions = options.file_exclusions.clone();
180    Self::add_generated_file_exclusions(&options, behavior, &mut file_exclusions);
181
182    Self {
183      file_exclusions,
184      options,
185      initialized: false,
186      behavior,
187    }
188  }
189
190  fn add_generated_file_exclusions(
191    options: &MarkdownGeneratorOptions,
192    behavior: MarkdownGeneratorBehavior,
193    file_exclusions: &mut Vec<String>,
194  ) {
195    if let Some(output_path) =
196      Self::repo_relative_path(&options.dir, &options.output_file_path)
197    {
198      Self::push_unique(file_exclusions, output_path);
199    }
200
201    if behavior.include_todo {
202      Self::push_unique(file_exclusions, String::from("todo"));
203    }
204
205    if behavior.include_embeddings_artifact {
206      Self::push_unique(file_exclusions, String::from("embeddings.json"));
207    }
208  }
209
210  fn push_unique(values: &mut Vec<String>, value: String) {
211    if !value.is_empty() && !values.iter().any(|existing| existing == &value) {
212      values.push(value);
213    }
214  }
215
216  fn repo_relative_path(repo_dir: &Path, path: &Path) -> Option<String> {
217    let relative_path = if path.is_absolute() {
218      if let Ok(path) = path.strip_prefix(repo_dir) {
219        path
220      } else {
221        let repo_dir = if repo_dir.is_absolute() {
222          repo_dir.to_path_buf()
223        } else {
224          std::env::current_dir().ok()?.join(repo_dir)
225        };
226        let repo_dir = repo_dir.canonicalize().unwrap_or(repo_dir);
227        path.strip_prefix(repo_dir).ok()?
228      }
229    } else {
230      path
231    };
232
233    let mut parts = Vec::new();
234    for component in relative_path.components() {
235      match component {
236        Component::CurDir => {}
237        Component::Normal(part) => parts.push(part.to_string_lossy().into_owned()),
238        _ => return None,
239      }
240    }
241
242    if parts.is_empty() {
243      None
244    } else {
245      Some(parts.join("/"))
246    }
247  }
248
249  /// Loads nested .aiignore files and updates the exclusion patterns
250  async fn load_nested_ignore_files(&mut self) -> Result<()> {
251    if self.options.verbose {
252      println!("Loading ignore patterns...");
253    }
254
255    // Find all .aiignore files
256    let mut ignore_files = Vec::new();
257    self.find_ignore_files(&self.options.dir, &mut ignore_files)?;
258
259    if self.options.verbose {
260      println!("Found {} ignore files", ignore_files.len());
261    }
262
263    // Process each ignore file
264    for ignore_file in ignore_files {
265      if let Ok(content) = fs::read_to_string(&ignore_file).await {
266        let patterns: Vec<String> = content
267          .lines()
268          .map(|line| line.trim())
269          .filter(|line| !line.is_empty() && !line.starts_with('#'))
270          .map(|s| s.to_string())
271          .collect();
272
273        // Get relative patterns based on ignore file location
274        if let Ok(ignore_dir) = ignore_file
275          .parent()
276          .unwrap_or_else(|| Path::new("."))
277          .to_path_buf()
278          .strip_prefix(&self.options.dir)
279        {
280          for pattern in patterns {
281            let relative_pattern = if ignore_dir.as_os_str().is_empty()
282              || pattern.starts_with('/')
283              || pattern.starts_with("**")
284            {
285              pattern
286            } else {
287              format!("{}/{}", ignore_dir.display(), pattern)
288            };
289            self.file_exclusions.push(relative_pattern);
290          }
291        }
292      }
293    }
294
295    // Remove duplicates
296    self.file_exclusions.sort();
297    self.file_exclusions.dedup();
298
299    if self.options.verbose {
300      println!("Total exclusion patterns: {}", self.file_exclusions.len());
301    }
302
303    Ok(())
304  }
305
306  fn find_ignore_files(&self, dir: &Path, results: &mut Vec<PathBuf>) -> Result<()> {
307    use walkdir::WalkDir;
308
309    for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
310      if entry.file_name() == ".aiignore" {
311        results.push(entry.path().to_path_buf());
312      }
313    }
314    Ok(())
315  }
316
317  /// Initializes the generator by loading ignore files
318  async fn initialize(&mut self) -> Result<()> {
319    if !self.initialized {
320      self.load_nested_ignore_files().await?;
321      self.initialized = true;
322    }
323    Ok(())
324  }
325
326  /// Gets tracked files from git, applying exclusions
327  async fn get_tracked_files(&mut self) -> Result<Vec<String>> {
328    self.initialize().await?;
329
330    // Run git ls-files
331    let output = Command::new("git")
332      .arg("ls-files")
333      .current_dir(&self.options.dir)
334      .output()
335      .map_err(|e| anyhow!("Failed to execute git ls-files: {}", e))?;
336
337    if !output.status.success() {
338      return Err(anyhow!("git ls-files failed"));
339    }
340
341    let output_str = String::from_utf8(output.stdout)
342      .map_err(|e| anyhow!("Failed to decode git output: {}", e))?;
343
344    let tracked_files: Vec<String> = output_str
345      .lines()
346      .filter(|line| !line.trim().is_empty())
347      .map(|s| s.to_string())
348      .collect();
349
350    if self.options.verbose {
351      println!("Total tracked files: {}", tracked_files.len());
352    }
353
354    let total_files = tracked_files.len();
355
356    // Filter by exclusions
357    let filtered_files = tracked_files
358      .into_iter()
359      .filter(|file| {
360        let path = Path::new(file);
361        let ext = path
362          .extension()
363          .and_then(|e| e.to_str())
364          .map(|e| format!(".{}", e))
365          .unwrap_or_default();
366
367        // Check if file type is excluded
368        if self.options.file_type_exclusions.contains(&ext) {
369          return false;
370        }
371
372        // Check if file matches exclusion patterns
373        !self.matches_exclusion_patterns(file)
374      })
375      .collect::<Vec<_>>();
376
377    if self.options.verbose {
378      println!("Excluded files: {}", total_files - filtered_files.len());
379      println!(
380        "Files to process after exclusions: {}",
381        filtered_files.len()
382      );
383    }
384
385    Ok(filtered_files)
386  }
387
388  /// Checks if a file path matches any exclusion patterns
389  fn matches_exclusion_patterns(&self, file: &str) -> bool {
390    for pattern in &self.file_exclusions {
391      if self.glob_match(pattern, file) {
392        return true;
393      }
394    }
395    false
396  }
397
398  /// Simple glob pattern matching
399  fn glob_match(&self, pattern: &str, path: &str) -> bool {
400    let pattern = pattern
401      .replace("**", ".*")
402      .replace("*", "[^/]*")
403      .replace("?", "[^/]");
404    let pattern = format!("^{}$", pattern);
405
406    if let Ok(re) = Regex::new(&pattern) {
407      re.is_match(path)
408    } else {
409      false
410    }
411  }
412
413  /// Checks if a file extension is an OCR-able type
414  #[cfg(all(target_os = "macos", feature = "embeddings"))]
415  fn is_ocr_file(ext: &str) -> bool {
416    OCR_FILE_TYPES.contains(&ext)
417  }
418
419  /// Reads and processes file content, using OCR for supported image/PDF types on macOS
420  async fn read_file_content(&self, file_path: &Path) -> Result<String> {
421    #[cfg(all(target_os = "macos", feature = "embeddings"))]
422    {
423      let ext = file_path
424        .extension()
425        .and_then(|e| e.to_str())
426        .map(|e| format!(".{}", e.to_lowercase()))
427        .unwrap_or_default();
428
429      if Self::is_ocr_file(&ext) {
430        return self.read_file_content_ocr(file_path).await;
431      }
432    }
433
434    let content = fs::read_to_string(file_path).await?;
435    let cleaned = clean_and_redact(&content);
436
437    if self.options.verbose && !cleaned.is_empty() {
438      let token_count = count_tokens(&cleaned);
439      println!("{}: Tokens[{}]", file_path.display(), token_count);
440    }
441
442    Ok(cleaned.trim_end().to_string())
443  }
444
445  /// Reads file content via OCR (macOS only)
446  #[cfg(all(target_os = "macos", feature = "embeddings"))]
447  async fn read_file_content_ocr(&self, file_path: &Path) -> Result<String> {
448    use toak_ocr::{AppleOcrEngine, OcrEngine, OcrInput};
449
450    let engine = AppleOcrEngine::new();
451    let input = OcrInput::FilePath(file_path.to_path_buf());
452    let output = engine
453      .recognize(&input)
454      .await
455      .map_err(|e| anyhow!("OCR failed for {}: {}", file_path.display(), e))?;
456
457    if self.options.verbose && !output.text.is_empty() {
458      let token_count = count_tokens(&output.text);
459      println!("{}: Tokens[{}] (OCR)", file_path.display(), token_count);
460    }
461
462    Ok(output.text.trim_end().to_string())
463  }
464
465  /// Generates markdown from all tracked files
466  async fn generate_markdown(&mut self) -> Result<String> {
467    let tracked_files = self.get_tracked_files().await?;
468
469    if self.options.verbose {
470      println!("Generating markdown for {} files", tracked_files.len());
471    }
472
473    let mut markdown = String::from("# Project Files\n\n");
474
475    for file in tracked_files {
476      let absolute_path = self.options.dir.join(&file);
477      match self.read_file_content(&absolute_path).await {
478        Ok(content) => {
479          if !content.trim().is_empty() {
480            markdown.push_str(&format!("## {}\n~~~\n{}\n~~~\n\n", file, content.trim()));
481          } else if self.options.verbose {
482            println!("Skipping {} as it has no content after cleaning.", file);
483          }
484        }
485        Err(e) => {
486          if self.options.verbose {
487            eprintln!("Error reading file {}: {}", file, e);
488          }
489        }
490      }
491    }
492
493    Ok(markdown)
494  }
495
496  /// Reads the todo file, optionally creating it for CLI parity.
497  async fn get_todo(&self) -> Result<Option<String>> {
498    let todo_path = self.options.dir.join("todo");
499
500    if self.options.verbose {
501      println!("Reading todo file");
502    }
503
504    match fs::read_to_string(&todo_path).await {
505      Ok(content) => Ok(Some(content)),
506      Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
507        if !self.behavior.create_todo_file {
508          return Ok(None);
509        }
510
511        if self.options.verbose {
512          println!("File not found, creating a new 'todo' file.");
513        }
514        fs::write(&todo_path, "").await?;
515        Ok(Some(String::new()))
516      }
517      Err(e) => Err(anyhow!("Error reading todo file: {}", e)),
518    }
519  }
520
521  fn gitignore_entries(&self) -> Vec<String> {
522    let mut entries = Vec::new();
523
524    if let Some(output_path) =
525      Self::repo_relative_path(&self.options.dir, &self.options.output_file_path)
526    {
527      Self::push_unique(&mut entries, output_path);
528    }
529
530    if self.behavior.include_todo {
531      Self::push_unique(&mut entries, String::from("todo"));
532    }
533
534    if self.behavior.include_embeddings_artifact {
535      Self::push_unique(&mut entries, String::from("embeddings.json"));
536    }
537
538    entries
539  }
540
541  /// Updates .gitignore to include generated artifacts managed by this run.
542  async fn update_gitignore(&self) -> Result<()> {
543    let entries = self.gitignore_entries();
544    if entries.is_empty() {
545      return Ok(());
546    }
547
548    let gitignore_path = self.options.dir.join(".gitignore");
549
550    let content = match fs::read_to_string(&gitignore_path).await {
551      Ok(c) => c,
552      Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
553        if self.options.verbose {
554          println!("File not found, creating a '.gitignore' file.");
555        }
556        String::new()
557      }
558      Err(e) => return Err(anyhow!("Error reading .gitignore: {}", e)),
559    };
560
561    let lines: Vec<&str> = content.lines().map(|l| l.trim()).collect();
562    let missing_entries = entries
563      .into_iter()
564      .filter(|entry| !lines.contains(&entry.as_str()))
565      .collect::<Vec<_>>();
566
567    if !missing_entries.is_empty() {
568      if self.options.verbose {
569        println!("Updating .gitignore with generated files");
570      }
571
572      let mut new_content = content;
573      if !new_content.is_empty() && !new_content.ends_with('\n') {
574        new_content.push('\n');
575      }
576
577      for entry in missing_entries {
578        new_content.push_str(&entry);
579        new_content.push('\n');
580      }
581
582      fs::write(&gitignore_path, new_content).await?;
583    }
584
585    Ok(())
586  }
587
588  /// Creates the complete markdown document that combines code snippets with todo notes.
589  pub async fn create_markdown_document(&mut self) -> Result<MarkdownResult> {
590    let code_markdown = self.generate_markdown().await?;
591    let markdown = if self.behavior.include_todo {
592      match self.get_todo().await? {
593        Some(todos) => format!("{}\n---\n\n{}\n", code_markdown, todos),
594        None => code_markdown,
595      }
596    } else {
597      code_markdown
598    };
599
600    if self.behavior.update_gitignore {
601      self.update_gitignore().await?;
602    }
603
604    let token_count = count_tokens(&markdown);
605
606    if self.options.verbose {
607      println!(
608        "Markdown document created at {}",
609        self.options.output_file_path.display()
610      );
611      println!("{{ \"total_tokens\": {} }}", token_count);
612    }
613
614    fs::write(&self.options.output_file_path, &markdown).await?;
615
616    Ok(MarkdownResult {
617      success: true,
618      token_count: Some(token_count),
619      error: None,
620    })
621  }
622}
623
/// Result returned after a markdown generation run.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct MarkdownResult {
  /// Whether the run completed; `create_markdown_document` sets this to `true` on success.
  pub success: bool,
  /// Token count of the generated document, when one was produced.
  pub token_count: Option<usize>,
  /// Error description; `create_markdown_document` leaves this as `None` and reports
  /// failures through its `Result` instead.
  pub error: Option<String>,
}