ferriclink_core/
documents.rs

1//! Document types for FerricLink Core
2//!
3//! This module provides document abstractions for handling text and structured data
4//! in the FerricLink ecosystem.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9use crate::errors::Result;
10use crate::impl_serializable;
11
12/// A document represents a piece of text with associated metadata
13#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
14pub struct Document {
15    /// The text content of the document
16    pub page_content: String,
17    /// Metadata associated with the document
18    #[serde(default)]
19    pub metadata: HashMap<String, serde_json::Value>,
20}
21
22impl Document {
23    /// Create a new document with the given content
24    pub fn new(page_content: impl Into<String>) -> Self {
25        Self {
26            page_content: page_content.into(),
27            metadata: HashMap::new(),
28        }
29    }
30
31    /// Create a new document with content and metadata
32    pub fn new_with_metadata(
33        page_content: impl Into<String>,
34        metadata: HashMap<String, serde_json::Value>,
35    ) -> Self {
36        Self {
37            page_content: page_content.into(),
38            metadata,
39        }
40    }
41
42    /// Add metadata to the document
43    pub fn add_metadata(&mut self, key: impl Into<String>, value: serde_json::Value) {
44        self.metadata.insert(key.into(), value);
45    }
46
47    /// Get metadata value by key
48    pub fn get_metadata(&self, key: &str) -> Option<&serde_json::Value> {
49        self.metadata.get(key)
50    }
51
52    /// Get metadata value as a string
53    pub fn get_metadata_string(&self, key: &str) -> Option<String> {
54        self.get_metadata(key)
55            .and_then(|v| v.as_str().map(|s| s.to_string()))
56    }
57
58    /// Get metadata value as a number
59    pub fn get_metadata_number(&self, key: &str) -> Option<f64> {
60        self.get_metadata(key).and_then(|v| v.as_f64())
61    }
62
63    /// Get metadata value as a boolean
64    pub fn get_metadata_bool(&self, key: &str) -> Option<bool> {
65        self.get_metadata(key).and_then(|v| v.as_bool())
66    }
67
68    /// Check if the document has a specific metadata key
69    pub fn has_metadata(&self, key: &str) -> bool {
70        self.metadata.contains_key(key)
71    }
72
73    /// Get the length of the document content
74    pub fn len(&self) -> usize {
75        self.page_content.len()
76    }
77
78    /// Check if the document is empty
79    pub fn is_empty(&self) -> bool {
80        self.page_content.is_empty()
81    }
82
83    /// Split the document into chunks
84    pub fn split(&self, chunk_size: usize, overlap: usize) -> Vec<Document> {
85        if self.page_content.len() <= chunk_size {
86            return vec![self.clone()];
87        }
88
89        let mut chunks = Vec::new();
90        let mut start = 0;
91
92        while start < self.page_content.len() {
93            let end = (start + chunk_size).min(self.page_content.len());
94            let chunk_content = self.page_content[start..end].to_string();
95
96            let mut chunk_metadata = self.metadata.clone();
97            chunk_metadata.insert(
98                "chunk_index".to_string(),
99                serde_json::Value::Number(serde_json::Number::from(chunks.len())),
100            );
101            chunk_metadata.insert(
102                "chunk_start".to_string(),
103                serde_json::Value::Number(serde_json::Number::from(start)),
104            );
105            chunk_metadata.insert(
106                "chunk_end".to_string(),
107                serde_json::Value::Number(serde_json::Number::from(end)),
108            );
109
110            chunks.push(Document {
111                page_content: chunk_content,
112                metadata: chunk_metadata,
113            });
114
115            if end >= self.page_content.len() {
116                break;
117            }
118
119            start = end.saturating_sub(overlap);
120        }
121
122        chunks
123    }
124
125    /// Join multiple documents into one
126    pub fn join(documents: &[Document], separator: &str) -> Document {
127        let mut content = String::new();
128        let mut metadata = HashMap::new();
129
130        for (i, doc) in documents.iter().enumerate() {
131            if i > 0 {
132                content.push_str(separator);
133            }
134            content.push_str(&doc.page_content);
135
136            // Merge metadata, with later documents taking precedence
137            for (key, value) in &doc.metadata {
138                metadata.insert(key.clone(), value.clone());
139            }
140        }
141
142        metadata.insert(
143            "source_documents".to_string(),
144            serde_json::Value::Number(serde_json::Number::from(documents.len())),
145        );
146
147        Document {
148            page_content: content,
149            metadata,
150        }
151    }
152}
153
// NOTE(review): presumably wires `Document` into the crate's serialization
// support (the tests below call `to_json`/`from_json` through
// `crate::serializable::Serializable`), using the listed path segments as the
// type's identifier — confirm against the `impl_serializable!` definition.
impl_serializable!(Document, ["ferriclink", "documents", "document"]);
155
156/// A collection of documents
157#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
158pub struct DocumentCollection {
159    /// The documents in the collection
160    pub documents: Vec<Document>,
161    /// Metadata about the collection
162    #[serde(default)]
163    pub metadata: HashMap<String, serde_json::Value>,
164}
165
166impl DocumentCollection {
167    /// Create a new empty document collection
168    pub fn new() -> Self {
169        Self {
170            documents: Vec::new(),
171            metadata: HashMap::new(),
172        }
173    }
174
175    /// Create a new document collection with documents
176    pub fn new_with_documents(documents: Vec<Document>) -> Self {
177        Self {
178            documents,
179            metadata: HashMap::new(),
180        }
181    }
182
183    /// Add a document to the collection
184    pub fn add_document(&mut self, document: Document) {
185        self.documents.push(document);
186    }
187
188    /// Add multiple documents to the collection
189    pub fn add_documents(&mut self, documents: Vec<Document>) {
190        self.documents.extend(documents);
191    }
192
193    /// Get the number of documents in the collection
194    pub fn len(&self) -> usize {
195        self.documents.len()
196    }
197
198    /// Check if the collection is empty
199    pub fn is_empty(&self) -> bool {
200        self.documents.is_empty()
201    }
202
203    /// Get a document by index
204    pub fn get(&self, index: usize) -> Option<&Document> {
205        self.documents.get(index)
206    }
207
208    /// Get all documents
209    pub fn documents(&self) -> &[Document] {
210        &self.documents
211    }
212
213    /// Get the total length of all documents
214    pub fn total_length(&self) -> usize {
215        self.documents.iter().map(|doc| doc.len()).sum()
216    }
217
218    /// Split all documents into chunks
219    pub fn split_all(&self, chunk_size: usize, overlap: usize) -> DocumentCollection {
220        let mut chunks = Vec::new();
221
222        for doc in &self.documents {
223            chunks.extend(doc.split(chunk_size, overlap));
224        }
225
226        DocumentCollection {
227            documents: chunks,
228            metadata: self.metadata.clone(),
229        }
230    }
231
232    /// Filter documents based on a predicate
233    pub fn filter<F>(&self, predicate: F) -> DocumentCollection
234    where
235        F: Fn(&Document) -> bool,
236    {
237        DocumentCollection {
238            documents: self
239                .documents
240                .iter()
241                .filter(|doc| predicate(doc))
242                .cloned()
243                .collect(),
244            metadata: self.metadata.clone(),
245        }
246    }
247
248    /// Map documents using a function
249    pub fn map<F>(&self, mapper: F) -> DocumentCollection
250    where
251        F: Fn(&Document) -> Document,
252    {
253        DocumentCollection {
254            documents: self.documents.iter().map(mapper).collect(),
255            metadata: self.metadata.clone(),
256        }
257    }
258}
259
260impl Default for DocumentCollection {
261    fn default() -> Self {
262        Self::new()
263    }
264}
265
// NOTE(review): presumably wires `DocumentCollection` into the crate's
// serialization support in the same way as `Document`, using the listed path
// segments as the type's identifier — confirm against the
// `impl_serializable!` definition.
impl_serializable!(
    DocumentCollection,
    ["ferriclink", "documents", "collection"]
);
270
271/// Trait for objects that can be converted to documents
272pub trait ToDocument {
273    /// Convert this object to a document
274    fn to_document(&self) -> Document;
275}
276
277/// Trait for objects that can be converted from documents
278pub trait FromDocument {
279    /// Convert a document to this object
280    fn from_document(document: &Document) -> Result<Self>
281    where
282        Self: Sized;
283}
284
285impl ToDocument for str {
286    fn to_document(&self) -> Document {
287        Document::new(self)
288    }
289}
290
291impl ToDocument for String {
292    fn to_document(&self) -> Document {
293        Document::new(self)
294    }
295}
296
297impl ToDocument for Document {
298    fn to_document(&self) -> Document {
299        self.clone()
300    }
301}
302
#[cfg(test)]
mod tests {
    use super::*;
    use crate::serializable::Serializable;

    /// A freshly built document keeps its text and carries no metadata.
    #[test]
    fn test_document_creation() {
        let created = Document::new("Hello, world!");

        assert!(created.metadata.is_empty());
        assert_eq!(created.page_content, "Hello, world!");
    }

    /// Metadata handed to the constructor is readable through the accessors.
    #[test]
    fn test_document_with_metadata() {
        let initial: HashMap<String, serde_json::Value> = std::iter::once((
            "source".to_string(),
            serde_json::Value::String("test".to_string()),
        ))
        .collect();

        let created = Document::new_with_metadata("Hello, world!", initial);

        assert_eq!(created.page_content, "Hello, world!");
        assert_eq!(
            created.get_metadata_string("source"),
            Some("test".to_string())
        );
    }

    /// Entries added one by one come back through the typed getters.
    #[test]
    fn test_document_metadata_operations() {
        let mut subject = Document::new("Test content");

        let text_value = serde_json::Value::String("value1".to_string());
        let number_value = serde_json::Value::Number(serde_json::Number::from(42));
        let flag_value = serde_json::Value::Bool(true);

        subject.add_metadata("key1", text_value);
        subject.add_metadata("key2", number_value);
        subject.add_metadata("key3", flag_value);

        assert_eq!(
            subject.get_metadata_string("key1"),
            Some("value1".to_string())
        );
        assert_eq!(subject.get_metadata_number("key2"), Some(42.0));
        assert_eq!(subject.get_metadata_bool("key3"), Some(true));
        assert!(subject.has_metadata("key1"));
        assert!(!subject.has_metadata("nonexistent"));
    }

    /// Content longer than the chunk size yields several indexed chunks.
    #[test]
    fn test_document_split() {
        let source =
            Document::new("This is a test document that should be split into multiple chunks.");

        let pieces = source.split(20, 5);

        assert!(pieces.len() > 1);
        for (position, expected_index) in [0.0, 1.0].iter().enumerate() {
            assert_eq!(
                pieces[position].get_metadata_number("chunk_index"),
                Some(*expected_index)
            );
        }
    }

    /// Joining concatenates the content and records how many inputs there were.
    #[test]
    fn test_document_join() {
        let parts = [
            Document::new("First document"),
            Document::new("Second document"),
        ];

        let combined = Document::join(&parts, " | ");

        assert_eq!(combined.page_content, "First document | Second document");
        assert_eq!(combined.get_metadata_number("source_documents"), Some(2.0));
    }

    /// Adding documents updates the count, emptiness and total length.
    #[test]
    fn test_document_collection() {
        let mut bundle = DocumentCollection::new();

        for text in ["Doc 1", "Doc 2"].iter() {
            bundle.add_document(Document::new(*text));
        }

        assert_eq!(bundle.len(), 2);
        assert!(!bundle.is_empty());
        assert_eq!(bundle.total_length(), 10); // "Doc 1" + "Doc 2"
    }

    /// `filter` keeps only matching documents; `map` transforms each of them.
    #[test]
    fn test_document_collection_operations() {
        let bundle = DocumentCollection::new_with_documents(vec![
            Document::new("Short"),
            Document::new("This is a longer document"),
        ]);

        let long_only = bundle.filter(|doc| doc.len() > 10);
        assert_eq!(long_only.len(), 1);

        let processed =
            bundle.map(|doc| Document::new(format!("Processed: {}", doc.page_content)));
        assert_eq!(processed.len(), 2);
        assert!(processed.documents[0]
            .page_content
            .starts_with("Processed:"));
    }

    /// `str`, `String` and `Document` all convert through `ToDocument`.
    #[test]
    fn test_to_document_trait() {
        let from_str = "Hello".to_document();
        let from_string = "World".to_string().to_document();
        let from_document = from_str.clone().to_document();

        assert_eq!(from_str.page_content, "Hello");
        assert_eq!(from_string.page_content, "World");
        assert_eq!(from_document.page_content, "Hello");
    }

    /// A document survives a JSON round trip unchanged.
    #[test]
    fn test_serialization() {
        let original = Document::new("Test content");

        let encoded = original.to_json().unwrap();
        let decoded: Document = Document::from_json(&encoded).unwrap();

        assert_eq!(original, decoded);
    }
}