Skip to main content

toak_rs/
lib.rs

//! # toak-rs
//!
//! A high-performance library for tokenizing git repositories, generating markdown documentation,
//! and creating semantic embeddings for code repositories.
//!
//! ## Features
//!
//! - **Code Cleaning & Secret Redaction**: Remove comments, imports, and sensitive information (API keys, tokens, passwords)
//! - **Tokenization**: Count tokens in code for LLM context window estimation
//! - **Text Chunking**: Split text into overlapping chunks optimized for embeddings and RAG applications
//! - **Embeddings Generation**: Create semantic vector embeddings for code chunks
//! - **Markdown Generation**: Convert repositories into well-structured markdown documentation
//! - **High Performance**: Built in Rust with concurrent file processing and no runtime dependencies
//!
//! Embeddings generation, the JSON embeddings database, and semantic search are only compiled
//! when the `embeddings` Cargo feature is enabled.
//!
//! ## Quick Start
//!
//! ```ignore
//! use toak_rs::prelude::*;
//!
//! // Clean and redact code
//! let cleaned = clean_and_redact("let api_key = 'sk-1234567890';");
//! assert!(!cleaned.contains("sk-"));
//!
//! // Generate embeddings (requires the `embeddings` feature)
//! let mut generator = EmbeddingsGenerator::new()?;
//! let embedding = generator.generate_embedding("let x = 5;")?;
//!
//! // Chunk text for RAG
//! let chunks = chunk_text("Hello world", &ChunkerConfig::default());
//!
//! // Perform semantic search on embeddings (requires the `embeddings` feature)
//! let mut search = SemanticSearch::new("embeddings.json")?;
//! let results = search.search("find rust code", 5)?;
//! for result in results {
//!     println!("{}: {:.4}", result.file_path, result.similarity);
//! }
//! ```
38
39#[cfg(feature = "embeddings")]
40pub mod embeddings_generator;
41#[cfg(feature = "embeddings")]
42pub mod json_database_generator;
43pub mod markdown_generator;
44#[cfg(feature = "embeddings")]
45pub mod semantic_search;
46pub mod text_chunker;
47pub mod token_cleaner;
48
49// Re-export commonly used types at the root level
50#[cfg(feature = "embeddings")]
51pub use embeddings_generator::EmbeddingsGenerator;
52#[cfg(feature = "embeddings")]
53pub use json_database_generator::{ChunkMetadata, EmbeddedChunk, EmbeddingsDatabase, JsonDatabaseGenerator, JsonDatabaseOptions, JsonDatabaseResult};
54pub use markdown_generator::{MarkdownGenerator, MarkdownGeneratorOptions, MarkdownResult};
55#[cfg(feature = "embeddings")]
56pub use semantic_search::{EmbeddingChunk, EmbeddingsDatabaseMetadata, SearchResult, SemanticSearch};
57pub use text_chunker::{chunk_text, ChunkerConfig, TextChunk};
58pub use token_cleaner::{clean_and_redact, clean_code, count_tokens, redact_secrets};
59
60/// Prelude module for convenient imports
61///
62/// Import everything you need with:
63/// ```ignore
64/// use toak_rs::prelude::*;
65/// ```
66pub mod prelude {
67    pub use crate::{
68        chunk_text, clean_and_redact, clean_code, count_tokens, redact_secrets, ChunkerConfig,
69        MarkdownGenerator, MarkdownGeneratorOptions, MarkdownResult, TextChunk,
70    };
71
72    #[cfg(feature = "embeddings")]
73    pub use crate::{
74        ChunkMetadata, EmbeddedChunk, EmbeddingChunk, EmbeddingsDatabase, EmbeddingsDatabaseMetadata,
75        EmbeddingsGenerator, JsonDatabaseGenerator, JsonDatabaseOptions, JsonDatabaseResult,
76        SearchResult, SemanticSearch,
77    };
78}