1
use crate::{index::index_path::IndexPath, server_error::MyError, urn::blob_to_uri};
2
use chrono::{DateTime, Utc};
3
use lazy_static::lazy_static;
4
use regex::Regex;
5
use serde::Serialize;
6
use sophia::{
7
    api::{
8
        dataset::{self, MutableDataset},
9
        graph::{CollectibleGraph, Graph},
10
        parser::TripleParser,
11
        prefix::{Prefix, PrefixMapPair},
12
        term::{matcher::Any, IriRef, SimpleTerm},
13
        triple::Triple,
14
        MownStr,
15
    },
16
    inmem::{dataset::FastDataset, graph::LightGraph},
17
    iri::Iri,
18
    turtle::parser::{nt::NTriplesParser, turtle::TurtleParser},
19
    xml::parser::RdfXmlParser,
20
};
21
use std::{
22
    collections::{BTreeSet, HashMap},
23
    error::Error,
24
    path::Path,
25
};
26

            
27
/// A named dataset loaded from disk, together with all of its versions.
pub(crate) struct Dataset {
    /// Identifier of the dataset: the name of its directory on disk.
    pub id: String,
    /// All loaded versions of this dataset (sorted newest-first by
    /// `load_dataset_versions`).
    pub versions: Vec<DatasetVersion>,
}
31

            
32
impl Dataset {
33
6
    pub fn latest_version(&self) -> Option<DateTime<Utc>> {
34
8
        self.versions.iter().map(|v| &v.date).max().copied()
35
6
    }
36
}
37

            
38
/// One dated version of a dataset, with all of its quads held in memory.
pub(crate) struct DatasetVersion {
    /// Timestamp of this version, derived from its `IndexPath` on disk.
    pub date: DateTime<Utc>,
    /// The parsed RDF data of this version.
    pub data: FastDataset,
}
42

            
43
/// A (predicate, object) pair for a fixed subject, as produced by
/// `DatasetVersion::pos`. At most one of `object_iri` / `object_literal`
/// is `Some` (both are `None` for e.g. blank-node objects).
#[derive(Serialize)]
pub(crate) struct PredicateObject<'a> {
    /// The predicate IRI of the matched quad.
    pub predicate: &'a IriRef<MownStr<'a>>,
    /// Set when the object term is an IRI.
    pub object_iri: Option<&'a IriRef<MownStr<'a>>>,
    /// Set when the object term is a literal (datatyped or language-tagged).
    pub object_literal: Option<&'a str>,
}
49

            
50
impl DatasetVersion {
51
4
    pub fn number_of_quads(&self) -> usize {
52
4
        sophia::api::prelude::Dataset::quads(&self.data).count()
53
4
    }
54
1
    pub fn subjects(&self) -> BTreeSet<&IriRef<MownStr>> {
55
1
        use dataset::Dataset;
56
1
        let subjects: BTreeSet<_> = self
57
1
            .data
58
1
            .subjects()
59
666
            .filter_map(|t| match t {
60
666
                Ok(SimpleTerm::Iri(iri)) => Some(iri),
61
                _ => None,
62
666
            })
63
1
            .collect();
64
1
        subjects
65
1
    }
66
    // return predicate and object for the given subject
67
124
    pub fn pos(&self, subject: &str) -> Vec<PredicateObject> {
68
124
        use dataset::Dataset;
69
124
        let subject = IriRef::new(subject.to_string()).unwrap();
70
124
        let pos: Vec<_> = self
71
124
            .data
72
124
            .quads_matching([subject], Any, Any, Any)
73
681
            .filter_map(|t| t.ok())
74
681
            .map(|t| (t.1[1], t.1[2]))
75
681
            .filter_map(|(p, o)| match p {
76
681
                SimpleTerm::Iri(p) => Some(PredicateObject {
77
681
                    predicate: p,
78
681
                    object_iri: match o {
79
438
                        SimpleTerm::Iri(o) => Some(o),
80
243
                        _ => None,
81
                    },
82
681
                    object_literal: match o {
83
243
                        SimpleTerm::LiteralDatatype(value, _) => Some(value),
84
                        SimpleTerm::LiteralLanguage(value, _) => Some(value),
85
438
                        _ => None,
86
                    },
87
                }),
88
                _ => None,
89
681
            })
90
124
            .collect();
91
124
        pos
92
124
    }
93
}
94

            
95
/**
96
 * Read each directory under `dir` as a dataset.
97
 */
98
18
pub(crate) fn load_datasets<P: AsRef<Path>>(dir: P) -> Result<Vec<Dataset>, Box<dyn Error>> {
99
18
    let paths = std::fs::read_dir(dir)?;
100
18
    let mut datasets = Vec::new();
101
36
    for path in paths.filter_map(|p| p.ok()) {
102
        // skip entries that are not directories
103
36
        if !path.file_type().map(|ft| ft.is_dir()).unwrap_or_default() {
104
            continue;
105
36
        }
106
36
        datasets.push(Dataset {
107
36
            id: path.file_name().to_string_lossy().to_string(),
108
36
            versions: load_dataset_versions(&path.path())?,
109
        });
110
    }
111
18
    Ok(datasets)
112
18
}
113

            
114
/**
115
 * Read each directory under `dir` as a version of the dataset.
116
 */
117
36
fn load_dataset_versions(dir: &Path) -> Result<Vec<DatasetVersion>, Box<dyn Error>> {
118
36
    let mut versions = Vec::new();
119
46
    for path in std::fs::read_dir(dir)?.filter_map(|p| p.ok()) {
120
46
        if let Ok(index_path) = IndexPath::try_from(path.path()) {
121
46
            versions.push(DatasetVersion {
122
46
                date: index_path.date(),
123
46
                data: load_dataset_version(&index_path)?,
124
            })
125
        }
126
    }
127
36
    versions.sort_unstable_by(|a, b| b.date.cmp(&a.date));
128
36
    Ok(versions)
129
36
}
130

            
131
/**
132
 * Parse a string into a graph with the given base.
133
 * The graph can be in either Turtle (ttl), RDF/XML or NTriples format.
134
 */
135
189
pub(crate) fn parse_graph(rdf: &str, base: Option<Iri<String>>) -> Result<LightGraph, MyError> {
136
189
    let parser = TurtleParser { base: base.clone() };
137
189
    let source = parser.parse_str(rdf);
138
189
    if let Ok(graph) = LightGraph::from_triple_source(source) {
139
189
        return Ok(graph);
140
    }
141
    let parser = RdfXmlParser { base };
142
    let source = parser.parse_str(rdf);
143
    if let Ok(graph) = LightGraph::from_triple_source(source) {
144
        return Ok(graph);
145
    }
146
    let parser = NTriplesParser {};
147
    let source = parser.parse_str(rdf);
148
    Ok(LightGraph::from_triple_source(source)?)
149
189
}
150

            
151
/**
152
 * Load the dataset for a version.
153
 * This is done by reading all the files in the directory and inserting them
154
 * in a [`dataset::Dataset`].
155
 */
156
46
/**
 * Load the dataset for a version.
 * This is done by reading all the files in the directory and inserting them
 * in a [`dataset::Dataset`].
 *
 * Each file's triples are stored in a named graph whose name is a URI
 * derived from the file's content hash (`blob_to_uri`), so each source
 * file maps to its own graph within the returned dataset.
 */
fn load_dataset_version(dir: &IndexPath) -> Result<FastDataset, Box<dyn Error>> {
    let mut dataset = FastDataset::new();
    for path in dir.file_paths()? {
        let rdf = std::fs::read_to_string(path)?;
        // Content-addressed URI for this blob; also used as parse base.
        let urn = blob_to_uri(rdf.as_bytes());
        let base_ref = IriRef::new(urn.clone())?;
        let base: Iri<String> = Iri::new(urn)?;
        let graph = parse_graph(&rdf, Some(base))?;
        for t in graph.triples() {
            let t = t?;
            // Insert each triple into the named graph identified by the
            // file's content URI.
            dataset.insert(t.s(), t.p(), t.o(), Some(base_ref.clone()))?;
        }
    }
    Ok(dataset)
}
171

            
172
lazy_static! {
    // Matches Turtle `@prefix` declarations, capturing the prefix label and
    // its IRI. Compared to a naive pattern this also accepts:
    // - the empty (default) prefix: `@prefix : <...> .` (label may be empty),
    // - `_` and `-` inside prefix labels,
    // - any run of whitespace after `@prefix` and an optional space before
    //   the terminating `.`.
    // `[^>]+` (instead of greedy `.+`) stops at the first `>`, so a line
    // containing more than one IRI is not over-captured.
    static ref PREFIX_RE: Regex =
        Regex::new(r"@prefix\s+([A-Za-z0-9_-]*):\s+<([^>]+)>\s*\.").unwrap();
}
175

            
176
1
pub(crate) fn load_prefixes<P: AsRef<Path>>(dir: P) -> Result<Vec<PrefixMapPair>, Box<dyn Error>> {
177
1
    let mut prefixes: HashMap<String, String> = HashMap::new();
178

            
179
1
    let paths = std::fs::read_dir(dir)?;
180
    // Dataset directories
181
2
    for path in paths.filter_map(|p| p.ok()) {
182
2
        if !path.file_type().map(|ft| ft.is_dir()).unwrap_or_default() {
183
            continue;
184
2
        }
185
        // Dataset version directories
186
2
        for path in std::fs::read_dir(&path.path())?.filter_map(|p| p.ok()) {
187
2
            if let Ok(index_path) = IndexPath::try_from(path.path()) {
188
                // RDF files
189
8
                for path in index_path.file_paths()? {
190
8
                    let rdf = std::fs::read_to_string(path)?;
191

            
192
                    // Find prefixes in RDF files
193
30
                    for (_, [prefix, url]) in PREFIX_RE.captures_iter(&rdf).map(|c| c.extract()) {
194
30
                        match prefixes.insert(prefix.to_string(), url.to_string()) {
195
7
                            None => (),
196
23
                            Some(old_url) => {
197
23
                                if url != old_url {
198
                                    // TODO Make this nicer
199
                                    panic!("Prefix \"{prefix}\" was defined twice with different values");
200
23
                                }
201
                            }
202
                        }
203
                    }
204
                }
205
            }
206
        }
207
    }
208
1
    Ok(prefixes
209
1
        .into_iter()
210
7
        .map(|(prefix, url)| {
211
7
            (
212
7
                Prefix::new_unchecked(prefix.into()),
213
7
                Iri::new_unchecked(url.into()),
214
7
            )
215
7
        })
216
1
        .collect())
217
1
}
218

            
219
1
pub(crate) fn get_pos_label(pos: &Vec<PredicateObject>) -> Option<String> {
220
1
    let target = "http://www.w3.org/2000/01/rdf-schema#label";
221
1
    query_pos(pos, target)
222
1
}
223

            
224
1
pub(crate) fn get_pos_comment(pos: &Vec<PredicateObject>) -> Option<String> {
225
1
    let target = "http://www.w3.org/2000/01/rdf-schema#comment";
226
1
    query_pos(pos, target)
227
1
}
228

            
229
2
pub(crate) fn query_pos(pos: &Vec<PredicateObject>, target: &str) -> Option<String> {
230
8
    let find_target_po = pos.into_iter().find(|po| po.predicate.as_str() == target);
231
2
    let Some(target_po) = find_target_po else {
232
2
        return None;
233
    };
234
    let Some(target_object) = target_po.object_literal else {
235
        return None;
236
    };
237

            
238
    Some(target_object.to_string())
239
2
}