1use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9use crate::errors::Result;
10use crate::impl_serializable;
11
12#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
14pub struct Document {
15 pub page_content: String,
17 #[serde(default)]
19 pub metadata: HashMap<String, serde_json::Value>,
20}
21
22impl Document {
23 pub fn new(page_content: impl Into<String>) -> Self {
25 Self {
26 page_content: page_content.into(),
27 metadata: HashMap::new(),
28 }
29 }
30
31 pub fn new_with_metadata(
33 page_content: impl Into<String>,
34 metadata: HashMap<String, serde_json::Value>,
35 ) -> Self {
36 Self {
37 page_content: page_content.into(),
38 metadata,
39 }
40 }
41
42 pub fn add_metadata(&mut self, key: impl Into<String>, value: serde_json::Value) {
44 self.metadata.insert(key.into(), value);
45 }
46
47 pub fn get_metadata(&self, key: &str) -> Option<&serde_json::Value> {
49 self.metadata.get(key)
50 }
51
52 pub fn get_metadata_string(&self, key: &str) -> Option<String> {
54 self.get_metadata(key)
55 .and_then(|v| v.as_str().map(|s| s.to_string()))
56 }
57
58 pub fn get_metadata_number(&self, key: &str) -> Option<f64> {
60 self.get_metadata(key).and_then(|v| v.as_f64())
61 }
62
63 pub fn get_metadata_bool(&self, key: &str) -> Option<bool> {
65 self.get_metadata(key).and_then(|v| v.as_bool())
66 }
67
68 pub fn has_metadata(&self, key: &str) -> bool {
70 self.metadata.contains_key(key)
71 }
72
73 pub fn len(&self) -> usize {
75 self.page_content.len()
76 }
77
78 pub fn is_empty(&self) -> bool {
80 self.page_content.is_empty()
81 }
82
83 pub fn split(&self, chunk_size: usize, overlap: usize) -> Vec<Document> {
85 if self.page_content.len() <= chunk_size {
86 return vec![self.clone()];
87 }
88
89 let mut chunks = Vec::new();
90 let mut start = 0;
91
92 while start < self.page_content.len() {
93 let end = (start + chunk_size).min(self.page_content.len());
94 let chunk_content = self.page_content[start..end].to_string();
95
96 let mut chunk_metadata = self.metadata.clone();
97 chunk_metadata.insert(
98 "chunk_index".to_string(),
99 serde_json::Value::Number(serde_json::Number::from(chunks.len())),
100 );
101 chunk_metadata.insert(
102 "chunk_start".to_string(),
103 serde_json::Value::Number(serde_json::Number::from(start)),
104 );
105 chunk_metadata.insert(
106 "chunk_end".to_string(),
107 serde_json::Value::Number(serde_json::Number::from(end)),
108 );
109
110 chunks.push(Document {
111 page_content: chunk_content,
112 metadata: chunk_metadata,
113 });
114
115 if end >= self.page_content.len() {
116 break;
117 }
118
119 start = end.saturating_sub(overlap);
120 }
121
122 chunks
123 }
124
125 pub fn join(documents: &[Document], separator: &str) -> Document {
127 let mut content = String::new();
128 let mut metadata = HashMap::new();
129
130 for (i, doc) in documents.iter().enumerate() {
131 if i > 0 {
132 content.push_str(separator);
133 }
134 content.push_str(&doc.page_content);
135
136 for (key, value) in &doc.metadata {
138 metadata.insert(key.clone(), value.clone());
139 }
140 }
141
142 metadata.insert(
143 "source_documents".to_string(),
144 serde_json::Value::Number(serde_json::Number::from(documents.len())),
145 );
146
147 Document {
148 page_content: content,
149 metadata,
150 }
151 }
152}
153
// NOTE(review): presumably implements the crate's `Serializable` trait for
// `Document` (the tests below call `to_json` / `from_json` through that trait);
// the string list looks like a namespace path — confirm against the
// `impl_serializable!` definition.
impl_serializable!(Document, ["ferriclink", "documents", "document"]);
155
156#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
158pub struct DocumentCollection {
159 pub documents: Vec<Document>,
161 #[serde(default)]
163 pub metadata: HashMap<String, serde_json::Value>,
164}
165
166impl DocumentCollection {
167 pub fn new() -> Self {
169 Self {
170 documents: Vec::new(),
171 metadata: HashMap::new(),
172 }
173 }
174
175 pub fn new_with_documents(documents: Vec<Document>) -> Self {
177 Self {
178 documents,
179 metadata: HashMap::new(),
180 }
181 }
182
183 pub fn add_document(&mut self, document: Document) {
185 self.documents.push(document);
186 }
187
188 pub fn add_documents(&mut self, documents: Vec<Document>) {
190 self.documents.extend(documents);
191 }
192
193 pub fn len(&self) -> usize {
195 self.documents.len()
196 }
197
198 pub fn is_empty(&self) -> bool {
200 self.documents.is_empty()
201 }
202
203 pub fn get(&self, index: usize) -> Option<&Document> {
205 self.documents.get(index)
206 }
207
208 pub fn documents(&self) -> &[Document] {
210 &self.documents
211 }
212
213 pub fn total_length(&self) -> usize {
215 self.documents.iter().map(|doc| doc.len()).sum()
216 }
217
218 pub fn split_all(&self, chunk_size: usize, overlap: usize) -> DocumentCollection {
220 let mut chunks = Vec::new();
221
222 for doc in &self.documents {
223 chunks.extend(doc.split(chunk_size, overlap));
224 }
225
226 DocumentCollection {
227 documents: chunks,
228 metadata: self.metadata.clone(),
229 }
230 }
231
232 pub fn filter<F>(&self, predicate: F) -> DocumentCollection
234 where
235 F: Fn(&Document) -> bool,
236 {
237 DocumentCollection {
238 documents: self
239 .documents
240 .iter()
241 .filter(|doc| predicate(doc))
242 .cloned()
243 .collect(),
244 metadata: self.metadata.clone(),
245 }
246 }
247
248 pub fn map<F>(&self, mapper: F) -> DocumentCollection
250 where
251 F: Fn(&Document) -> Document,
252 {
253 DocumentCollection {
254 documents: self.documents.iter().map(mapper).collect(),
255 metadata: self.metadata.clone(),
256 }
257 }
258}
259
260impl Default for DocumentCollection {
261 fn default() -> Self {
262 Self::new()
263 }
264}
265
// NOTE(review): presumably implements the crate's `Serializable` trait for
// `DocumentCollection`; the string list looks like a namespace path — confirm
// against the `impl_serializable!` definition.
impl_serializable!(
    DocumentCollection,
    ["ferriclink", "documents", "collection"]
);
270
/// Conversion of a borrowed value into an owned [`Document`].
pub trait ToDocument {
    /// Builds a new `Document` from `self` without consuming it.
    fn to_document(&self) -> Document;
}
276
/// Fallible construction of a value from a borrowed [`Document`].
pub trait FromDocument {
    /// Attempts to build `Self` from `document`, reporting failure through the
    /// crate's `Result` type.
    fn from_document(document: &Document) -> Result<Self>
    where
        Self: Sized;
}
284
285impl ToDocument for str {
286 fn to_document(&self) -> Document {
287 Document::new(self)
288 }
289}
290
291impl ToDocument for String {
292 fn to_document(&self) -> Document {
293 Document::new(self)
294 }
295}
296
297impl ToDocument for Document {
298 fn to_document(&self) -> Document {
299 self.clone()
300 }
301}
302
#[cfg(test)]
mod tests {
    use super::*;
    use crate::serializable::Serializable;

    /// A freshly created document keeps its text and starts without metadata.
    #[test]
    fn test_document_creation() {
        let created = Document::new("Hello, world!");

        assert_eq!(created.page_content, "Hello, world!");
        assert!(created.metadata.is_empty());
    }

    /// Metadata passed to the constructor is readable through the typed getter.
    #[test]
    fn test_document_with_metadata() {
        let mut initial = HashMap::new();
        initial.insert(
            String::from("source"),
            serde_json::Value::String(String::from("test")),
        );

        let created = Document::new_with_metadata("Hello, world!", initial);

        assert_eq!(created.page_content, "Hello, world!");
        assert_eq!(
            created.get_metadata_string("source"),
            Some(String::from("test"))
        );
    }

    /// String, number and boolean metadata round-trip through the typed getters.
    #[test]
    fn test_document_metadata_operations() {
        let mut subject = Document::new("Test content");

        subject.add_metadata("key1", serde_json::Value::String(String::from("value1")));
        subject.add_metadata(
            "key2",
            serde_json::Value::Number(serde_json::Number::from(42)),
        );
        subject.add_metadata("key3", serde_json::Value::Bool(true));

        assert_eq!(
            subject.get_metadata_string("key1"),
            Some(String::from("value1"))
        );
        assert_eq!(subject.get_metadata_number("key2"), Some(42.0));
        assert_eq!(subject.get_metadata_bool("key3"), Some(true));
        assert!(subject.has_metadata("key1"));
        assert!(!subject.has_metadata("nonexistent"));
    }

    /// Content longer than the chunk size yields several indexed chunks.
    #[test]
    fn test_document_split() {
        let source =
            Document::new("This is a test document that should be split into multiple chunks.");

        let pieces = source.split(20, 5);

        assert!(pieces.len() > 1);
        assert_eq!(pieces[0].get_metadata_number("chunk_index"), Some(0.0));
        assert_eq!(pieces[1].get_metadata_number("chunk_index"), Some(1.0));
    }

    /// Joining inserts the separator and records the number of inputs.
    #[test]
    fn test_document_join() {
        let inputs = [
            Document::new("First document"),
            Document::new("Second document"),
        ];

        let merged = Document::join(&inputs, " | ");

        assert_eq!(merged.page_content, "First document | Second document");
        assert_eq!(merged.get_metadata_number("source_documents"), Some(2.0));
    }

    /// Adding documents updates the count and the total content length.
    #[test]
    fn test_document_collection() {
        let mut docs = DocumentCollection::new();

        docs.add_document(Document::new("Doc 1"));
        docs.add_document(Document::new("Doc 2"));

        assert_eq!(docs.len(), 2);
        assert!(!docs.is_empty());
        // "Doc 1" and "Doc 2" are five bytes each.
        assert_eq!(docs.total_length(), 10);
    }

    /// `filter` keeps only matching documents; `map` transforms every document.
    #[test]
    fn test_document_collection_operations() {
        let docs = DocumentCollection::new_with_documents(vec![
            Document::new("Short"),
            Document::new("This is a longer document"),
        ]);

        let long_only = docs.filter(|doc| doc.len() > 10);
        assert_eq!(long_only.len(), 1);

        let processed =
            docs.map(|doc| Document::new(format!("Processed: {}", doc.page_content)));
        assert_eq!(processed.len(), 2);
        assert!(processed.documents[0]
            .page_content
            .starts_with("Processed:"));
    }

    /// `&str`, `String` and `Document` all convert through `ToDocument`.
    #[test]
    fn test_to_document_trait() {
        let from_str = "Hello".to_document();
        let from_string = String::from("World").to_document();
        let from_doc = from_str.clone().to_document();

        assert_eq!(from_str.page_content, "Hello");
        assert_eq!(from_string.page_content, "World");
        assert_eq!(from_doc.page_content, "Hello");
    }

    /// A document survives a JSON round trip unchanged.
    #[test]
    fn test_serialization() {
        let original = Document::new("Test content");

        let encoded = original.to_json().unwrap();
        let decoded: Document = Document::from_json(&encoded).unwrap();

        assert_eq!(original, decoded);
    }
}