use crate::{index::index_path::IndexPath, server_error::MyError, urn::blob_to_uri};
use chrono::{DateTime, Utc};
use serde::Serialize;
use sophia::{
    api::{
        dataset::{self, MutableDataset},
        graph::{CollectibleGraph, Graph},
        parser::TripleParser,
        term::{matcher::Any, IriRef, SimpleTerm},
        triple::Triple,
        MownStr,
    },
    inmem::{dataset::FastDataset, graph::LightGraph},
    iri::Iri,
    turtle::parser::{nt::NTriplesParser, turtle::TurtleParser},
    xml::parser::RdfXmlParser,
};
use std::{collections::BTreeSet, error::Error, path::Path};

pub(crate) struct Dataset {
    pub id: String,
    pub versions: Vec<DatasetVersion>,
}

impl Dataset {
    pub fn latest_version(&self) -> Option<DateTime<Utc>> {
        self.versions.iter().map(|v| &v.date).max().copied()
    }
}

pub(crate) struct DatasetVersion {
    pub date: DateTime<Utc>,
    pub data: FastDataset,
}

#[derive(Serialize)]
pub(crate) struct PredicateObject<'a> {
    predicate: &'a IriRef<MownStr<'a>>,
    object_iri: Option<&'a IriRef<MownStr<'a>>>,
    object_literal: Option<&'a str>,
}
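
// A hedged sketch of how a `PredicateObject` might serialize (e.g. via
// `serde_json`); the IRI values below are placeholders, and the exact
// rendering of `IriRef` depends on sophia's serde support:
//
//   { "predicate": "http://example.org/p",
//     "object_iri": "http://example.org/o",
//     "object_literal": null }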

impl DatasetVersion {
    pub fn number_of_quads(&self) -> usize {
        sophia::api::prelude::Dataset::quads(&self.data).count()
    }

    pub fn subjects(&self) -> BTreeSet<&IriRef<MownStr>> {
        use dataset::Dataset;
        let subjects: BTreeSet<_> = self
            .data
            .subjects()
            .filter_map(|t| match t {
                Ok(SimpleTerm::Iri(iri)) => Some(iri),
                _ => None,
            })
            .collect();
        subjects
    }

    /**
     * Return the predicate and object pairs for the given subject.
     */
    pub fn pos(&self, subject: &str) -> Vec<PredicateObject> {
        use dataset::Dataset;
        let subject = IriRef::new(subject.to_string()).unwrap();
        let pos: Vec<_> = self
            .data
            .quads_matching([subject], Any, Any, Any)
            .filter_map(|t| t.ok())
            .map(|t| (t.1[1], t.1[2]))
            .filter_map(|(p, o)| match p {
                SimpleTerm::Iri(p) => Some(PredicateObject {
                    predicate: p,
                    object_iri: match o {
                        SimpleTerm::Iri(o) => Some(o),
                        _ => None,
                    },
                    object_literal: match o {
                        SimpleTerm::LiteralDatatype(value, _) => Some(value),
                        SimpleTerm::LiteralLanguage(value, _) => Some(value),
                        _ => None,
                    },
                }),
                _ => None,
            })
            .collect();
        pos
    }
}
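
// A usage sketch, assuming a `version: &DatasetVersion` binding (not part
// of this module): walk every subject and its predicate/object pairs.
//
//   for subject in version.subjects() {
//       for po in version.pos(subject.as_str()) {
//           // `po.object_iri` or `po.object_literal` carries the object,
//           // depending on whether it is an IRI or a literal.
//       }
//   }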

/**
 * Read each directory under `dir` as a dataset.
 */
pub(crate) fn load_datasets<P: AsRef<Path>>(dir: P) -> Result<Vec<Dataset>, Box<dyn Error>> {
    let paths = std::fs::read_dir(dir)?;
    let mut datasets = Vec::new();
    for path in paths.filter_map(|p| p.ok()) {
        // Skip entries that are not directories.
        if !path.file_type().map(|ft| ft.is_dir()).unwrap_or_default() {
            continue;
        }
        datasets.push(Dataset {
            id: path.file_name().to_string_lossy().to_string(),
            versions: load_dataset_versions(&path.path())?,
        });
    }
    Ok(datasets)
}
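
// Assumed on-disk layout (illustrative; the exact version naming is
// defined by `IndexPath`):
//
//   <dir>/
//     <dataset-id>/            one dataset per directory
//       <index-path entry>/    one version, recognized by `IndexPath::try_from`
//         <rdf files...>       parsed and merged into one FastDataset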

/**
 * Read each directory under `dir` as a version of the dataset.
 */
fn load_dataset_versions(dir: &Path) -> Result<Vec<DatasetVersion>, Box<dyn Error>> {
    let mut versions = Vec::new();
    for path in std::fs::read_dir(dir)?.filter_map(|p| p.ok()) {
        if let Ok(index_path) = IndexPath::try_from(path.path()) {
            versions.push(DatasetVersion {
                date: index_path.date(),
                data: load_dataset_version(&index_path)?,
            });
        }
    }
    // Sort versions newest first.
    versions.sort_unstable_by(|a, b| b.date.cmp(&a.date));
    Ok(versions)
}

/**
 * Parse a string into a graph with the given base.
 * The graph can be in either Turtle (ttl), RDF/XML or N-Triples format.
 */
pub(crate) fn parse_graph(rdf: &str, base: Option<Iri<String>>) -> Result<LightGraph, MyError> {
    // Try Turtle first, then RDF/XML, then N-Triples.
    let parser = TurtleParser { base: base.clone() };
    let source = parser.parse_str(rdf);
    if let Ok(graph) = LightGraph::from_triple_source(source) {
        return Ok(graph);
    }
    let parser = RdfXmlParser { base };
    let source = parser.parse_str(rdf);
    if let Ok(graph) = LightGraph::from_triple_source(source) {
        return Ok(graph);
    }
    let parser = NTriplesParser {};
    let source = parser.parse_str(rdf);
    Ok(LightGraph::from_triple_source(source)?)
}

/**
 * Load the dataset for a version.
 * This is done by reading all the files in the directory and inserting them
 * in a [`dataset::Dataset`].
 */
fn load_dataset_version(dir: &IndexPath) -> Result<FastDataset, Box<dyn Error>> {
    let mut dataset = FastDataset::new();
    for path in dir.file_paths()? {
        let rdf = std::fs::read_to_string(path)?;
        // Use a content-derived URN as both the parse base and the graph name.
        let urn = blob_to_uri(rdf.as_bytes());
        let base_ref = IriRef::new(urn.clone())?;
        let base: Iri<String> = Iri::new(urn)?;
        let graph = parse_graph(&rdf, Some(base))?;
        for t in graph.triples() {
            let t = t?;
            dataset.insert(t.s(), t.p(), t.o(), Some(base_ref.clone()))?;
        }
    }
    Ok(dataset)
}
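
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the format fallback in `parse_graph`: both
    // snippets encode the same single triple, once as Turtle and once as
    // N-Triples, so each should parse without an explicit base. The
    // example.org IRIs are placeholders.
    #[test]
    fn parse_graph_accepts_turtle_and_ntriples() {
        let ttl = "@prefix ex: <http://example.org/> .\nex:s ex:p ex:o .";
        let nt =
            "<http://example.org/s> <http://example.org/p> <http://example.org/o> .";
        for rdf in [ttl, nt] {
            let graph = parse_graph(rdf, None).expect("parse_graph should succeed");
            assert_eq!(graph.triples().count(), 1);
        }
    }
}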