use crate::{index::index_path::IndexPath, server_error::MyError, urn::blob_to_uri};
use chrono::{DateTime, Utc};
use lazy_static::lazy_static;
use regex::Regex;
use serde::Serialize;
use sophia::{
    api::{
        dataset::{self, MutableDataset},
        graph::{CollectibleGraph, Graph},
        parser::TripleParser,
        prefix::{Prefix, PrefixMapPair},
        term::{matcher::Any, IriRef, SimpleTerm},
        triple::Triple,
        MownStr,
    },
    inmem::{dataset::FastDataset, graph::LightGraph},
    iri::Iri,
    turtle::parser::{nt::NTriplesParser, turtle::TurtleParser},
    xml::parser::RdfXmlParser,
};
use std::{
    collections::{BTreeSet, HashMap},
    error::Error,
    path::Path,
};

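/**
 * A dataset, identified by the name of its directory, together with all of
 * its loaded versions.
 */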
pub(crate) struct Dataset {
    pub id: String,
    pub versions: Vec<DatasetVersion>,
}

impl Dataset {
    pub fn latest_version(&self) -> Option<DateTime<Utc>> {
        self.versions.iter().map(|v| &v.date).max().copied()
    }
}

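/**
 * One version of a dataset: the version date and the RDF data loaded for it.
 */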
pub(crate) struct DatasetVersion {
    pub date: DateTime<Utc>,
    pub data: FastDataset,
}

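/**
 * A predicate together with its object, with the object split into its IRI
 * and literal representations so it can be serialized.
 */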
#[derive(Serialize)]
pub(crate) struct PredicateObject<'a> {
    pub predicate: &'a IriRef<MownStr<'a>>,
    pub object_iri: Option<&'a IriRef<MownStr<'a>>>,
    pub object_literal: Option<&'a str>,
}

impl DatasetVersion {
    pub fn number_of_quads(&self) -> usize {
        sophia::api::prelude::Dataset::quads(&self.data).count()
    }

    pub fn subjects(&self) -> BTreeSet<&IriRef<MownStr>> {
        use dataset::Dataset;
        let subjects: BTreeSet<_> = self
            .data
            .subjects()
            .filter_map(|t| match t {
                Ok(SimpleTerm::Iri(iri)) => Some(iri),
                _ => None,
            })
            .collect();
        subjects
    }

    // return predicate and object for the given subject
    pub fn pos(&self, subject: &str) -> Vec<PredicateObject> {
        use dataset::Dataset;
        // panics on an invalid subject IRI
        let subject = IriRef::new(subject.to_string()).unwrap();
        let pos: Vec<_> = self
            .data
            .quads_matching([subject], Any, Any, Any)
            .filter_map(|t| t.ok())
            .map(|t| (t.1[1], t.1[2]))
            .filter_map(|(p, o)| match p {
                SimpleTerm::Iri(p) => Some(PredicateObject {
                    predicate: p,
                    object_iri: match o {
                        SimpleTerm::Iri(o) => Some(o),
                        _ => None,
                    },
                    object_literal: match o {
                        SimpleTerm::LiteralDatatype(value, _) => Some(value),
                        SimpleTerm::LiteralLanguage(value, _) => Some(value),
                        _ => None,
                    },
                }),
                _ => None,
            })
            .collect();
        pos
    }
}

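// Illustrative use of `pos` (a sketch; `version` is a `DatasetVersion` and the
// subject IRI is hypothetical):
//
//     let pos = version.pos("http://example.org/thing");
//     let label = get_pos_label(&pos);
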
/**
* Read each directory under `dir` as a dataset.
*/
pub(crate) fn load_datasets<P: AsRef<Path>>(dir: P) -> Result<Vec<Dataset>, Box<dyn Error>> {
    let paths = std::fs::read_dir(dir)?;
    let mut datasets = Vec::new();
    for path in paths.filter_map(|p| p.ok()) {
        // skip entries that are not directories
        if !path.file_type().map(|ft| ft.is_dir()).unwrap_or_default() {
            continue;
        }
        datasets.push(Dataset {
            id: path.file_name().to_string_lossy().to_string(),
            versions: load_dataset_versions(&path.path())?,
        });
    }
    Ok(datasets)
}

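// Illustrative call (a sketch; the `data` directory name is hypothetical):
//
//     let datasets = load_datasets("data")?;
//     for dataset in &datasets {
//         println!("{}: {} version(s)", dataset.id, dataset.versions.len());
//     }
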
/**
 * Read each directory under `dir` as a version of the dataset.
 */
fn load_dataset_versions(dir: &Path) -> Result<Vec<DatasetVersion>, Box<dyn Error>> {
    let mut versions = Vec::new();
    for path in std::fs::read_dir(dir)?.filter_map(|p| p.ok()) {
        if let Ok(index_path) = IndexPath::try_from(path.path()) {
            versions.push(DatasetVersion {
                date: index_path.date(),
                data: load_dataset_version(&index_path)?,
            });
        }
    }
    // newest version first
    versions.sort_unstable_by(|a, b| b.date.cmp(&a.date));
    Ok(versions)
}

/**
 * Parse a string into a graph with the given base.
 * The graph can be in either Turtle (ttl), RDF/XML or N-Triples format;
 * the parsers are tried in that order and the first one that succeeds wins.
 */
pub(crate) fn parse_graph(rdf: &str, base: Option<Iri<String>>) -> Result<LightGraph, MyError> {
    let parser = TurtleParser { base: base.clone() };
    let source = parser.parse_str(rdf);
    if let Ok(graph) = LightGraph::from_triple_source(source) {
        return Ok(graph);
    }
    let parser = RdfXmlParser { base };
    let source = parser.parse_str(rdf);
    if let Ok(graph) = LightGraph::from_triple_source(source) {
        return Ok(graph);
    }
    let parser = NTriplesParser {};
    let source = parser.parse_str(rdf);
    Ok(LightGraph::from_triple_source(source)?)
}

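// Illustrative fallback (a sketch): this input is valid Turtle, so the first
// parser succeeds; an RDF/XML or N-Triples document would be picked up by one
// of the later attempts instead.
//
//     let ttl = r#"<http://example.org/s> <http://example.org/p> "o" ."#;
//     let graph = parse_graph(ttl, None)?;
//     assert_eq!(graph.triples().count(), 1);
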
/**
 * Load the dataset for a version.
 * This is done by reading all the files in the directory and inserting them
 * in a [`dataset::Dataset`].
 */
fn load_dataset_version(dir: &IndexPath) -> Result<FastDataset, Box<dyn Error>> {
    let mut dataset = FastDataset::new();
    for path in dir.file_paths()? {
        let rdf = std::fs::read_to_string(path)?;
        // the graph name is a URN derived from the file contents
        let urn = blob_to_uri(rdf.as_bytes());
        let base_ref = IriRef::new(urn.clone())?;
        let base: Iri<String> = Iri::new(urn)?;
        let graph = parse_graph(&rdf, Some(base))?;
        for t in graph.triples() {
            let t = t?;
            dataset.insert(t.s(), t.p(), t.o(), Some(base_ref.clone()))?;
        }
    }
    Ok(dataset)
}

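// `PREFIX_RE` matches Turtle prefix declarations such as
// `@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .`,
// capturing the prefix name and the namespace IRI.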
lazy_static! {
    static ref PREFIX_RE: Regex = Regex::new(r"@prefix ([a-zA-Z0-9]+):\s+<(.+)>\s\.").unwrap();
}

/**
 * Collect all `@prefix` declarations found in the RDF files under `dir`,
 * panicking if the same prefix is bound to two different IRIs.
 */
pub(crate) fn load_prefixes<P: AsRef<Path>>(dir: P) -> Result<Vec<PrefixMapPair>, Box<dyn Error>> {
    let mut prefixes: HashMap<String, String> = HashMap::new();
    // Dataset directories
    for path in std::fs::read_dir(dir)?.filter_map(|p| p.ok()) {
        // Dataset version directories
        for path in std::fs::read_dir(&path.path())?.filter_map(|p| p.ok()) {
            if let Ok(index_path) = IndexPath::try_from(path.path()) {
                // RDF files
                for path in index_path.file_paths()? {
                    let rdf = std::fs::read_to_string(path)?;
                    // Find prefixes in RDF files
                    for (_, [prefix, url]) in PREFIX_RE.captures_iter(&rdf).map(|c| c.extract()) {
                        match prefixes.insert(prefix.to_string(), url.to_string()) {
                            None => (),
                            Some(old_url) => {
                                if url != old_url {
                                    // TODO Make this nicer
                                    panic!("Prefix \"{prefix}\" was defined twice with different values");
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    Ok(prefixes
        .into_iter()
        .map(|(prefix, url)| {
            (
                Prefix::new_unchecked(prefix.into()),
                Iri::new_unchecked(url.into()),
            )
        })
        .collect())
}

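// Illustrative call (a sketch; `data` is the same hypothetical layout as in
// `load_datasets`):
//
//     let prefixes = load_prefixes("data")?;
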
pub(crate) fn get_pos_label<'a>(pos: &'a Vec<PredicateObject<'a>>) -> Option<&str> {
    let target = "http://www.w3.org/2000/01/rdf-schema#label";
    find_predicate_object_literals_for_subject(pos, target)
        .first()
        .copied()
}

pub(crate) fn get_pos_comment<'a>(pos: &'a Vec<PredicateObject<'a>>) -> Option<&str> {
    let target = "http://www.w3.org/2000/01/rdf-schema#comment";
    find_predicate_object_literals_for_subject(pos, target)
        .first()
        .copied()
}

pub(crate) fn find_predicate_object_literals_for_subject<'a>(
    pos: &'a Vec<PredicateObject<'a>>,
    target: &'a str,
) -> Vec<&'a str> {
    find_predicate_objects_for_subject(pos, target)
        .into_iter()
        .filter(|po| po.object_literal.is_some())
        .map(|po| po.object_literal.unwrap_or_default())
        .collect()
}

pub(crate) fn find_predicate_objects_for_subject<'a>(
    pos: &'a Vec<PredicateObject<'a>>,
    target: &'a str,
) -> Vec<&'a PredicateObject<'a>> {
    pos.into_iter()
        .filter(|po| po.predicate.as_str() == target)
        .collect()
}
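
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch, not part of the original test suite: build a
    // `PredicateObject` by hand and exercise the rdfs:label / rdfs:comment
    // helpers.
    #[test]
    fn label_helpers_pick_the_literal() {
        let label_iri =
            IriRef::new(MownStr::from("http://www.w3.org/2000/01/rdf-schema#label")).unwrap();
        let pos = vec![PredicateObject {
            predicate: &label_iri,
            object_iri: None,
            object_literal: Some("Example label"),
        }];
        assert_eq!(get_pos_label(&pos), Some("Example label"));
        assert!(get_pos_comment(&pos).is_none());
    }
}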