use crate::{index::index_path::IndexPath, server_error::MyError, urn::blob_to_uri};
use chrono::{DateTime, Utc};
use lazy_static::lazy_static;
use regex::Regex;
use serde::Serialize;
use sophia::{
    api::{
        dataset::{self, MutableDataset},
        graph::{CollectibleGraph, Graph},
        parser::TripleParser,
        prefix::{Prefix, PrefixMapPair},
        term::{matcher::Any, IriRef, SimpleTerm},
        triple::Triple,
        MownStr,
    },
    inmem::{dataset::FastDataset, graph::LightGraph},
    iri::Iri,
    turtle::parser::{nt::NTriplesParser, turtle::TurtleParser},
    xml::parser::RdfXmlParser,
};
use std::{
    collections::{BTreeSet, HashMap},
    error::Error,
    path::Path,
};

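/**
 * A dataset, identified by the name of its directory, together with all of
 * its loaded versions.
 */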
pub(crate) struct Dataset {
    pub id: String,
    pub versions: Vec<DatasetVersion>,
}

impl Dataset {
    pub fn latest_version(&self) -> Option<DateTime<Utc>> {
        self.versions.iter().map(|v| &v.date).max().copied()
    }
}

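/**
 * One version of a dataset: the version date and the RDF data loaded for it.
 */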
pub(crate) struct DatasetVersion {
    pub date: DateTime<Utc>,
    pub data: FastDataset,
}

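/**
 * A predicate together with its object, with the object split into its IRI
 * and literal representations so it can be serialized.
 */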
#[derive(Serialize)]
pub(crate) struct PredicateObject<'a> {
    pub predicate: &'a IriRef<MownStr<'a>>,
    pub object_iri: Option<&'a IriRef<MownStr<'a>>>,
    pub object_literal: Option<&'a str>,
}

impl DatasetVersion {
    pub fn number_of_quads(&self) -> usize {
        sophia::api::prelude::Dataset::quads(&self.data).count()
    }

    pub fn subjects(&self) -> BTreeSet<&IriRef<MownStr>> {
        use dataset::Dataset;
        let subjects: BTreeSet<_> = self
            .data
            .subjects()
            .filter_map(|t| match t {
                Ok(SimpleTerm::Iri(iri)) => Some(iri),
                _ => None,
            })
            .collect();
        subjects
    }

    // return predicate and object for the given subject
    pub fn pos(&self, subject: &str) -> Vec<PredicateObject> {
        use dataset::Dataset;
        // panics on an invalid subject IRI
        let subject = IriRef::new(subject.to_string()).unwrap();
        let pos: Vec<_> = self
            .data
            .quads_matching([subject], Any, Any, Any)
            .filter_map(|t| t.ok())
            .map(|t| (t.1[1], t.1[2]))
            .filter_map(|(p, o)| match p {
                SimpleTerm::Iri(p) => Some(PredicateObject {
                    predicate: p,
                    object_iri: match o {
                        SimpleTerm::Iri(o) => Some(o),
                        _ => None,
                    },
                    object_literal: match o {
                        SimpleTerm::LiteralDatatype(value, _) => Some(value),
                        SimpleTerm::LiteralLanguage(value, _) => Some(value),
                        _ => None,
                    },
                }),
                _ => None,
            })
            .collect();
        pos
    }
}

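// Illustrative use of `pos` (a sketch; `version` is a `DatasetVersion` and the
// subject IRI is hypothetical):
//
//     let pos = version.pos("http://example.org/thing");
//     let label = get_pos_label(&pos);
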
/**
* Read each directory under `dir` as a dataset.
*/
pub(crate) fn load_datasets<P: AsRef<Path>>(dir: P) -> Result<Vec<Dataset>, Box<dyn Error>> {
    let paths = std::fs::read_dir(dir)?;
    let mut datasets = Vec::new();
    for path in paths.filter_map(|p| p.ok()) {
        // skip entries that are not directories
        if !path.file_type().map(|ft| ft.is_dir()).unwrap_or_default() {
            continue;
        }
        datasets.push(Dataset {
            id: path.file_name().to_string_lossy().to_string(),
            versions: load_dataset_versions(&path.path())?,
        });
    }
    Ok(datasets)
}

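// Illustrative call (a sketch; the `data` directory name is hypothetical):
//
//     let datasets = load_datasets("data")?;
//     for dataset in &datasets {
//         println!("{}: {} version(s)", dataset.id, dataset.versions.len());
//     }
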
/**
 * Read each directory under `dir` as a version of the dataset.
 */
fn load_dataset_versions(dir: &Path) -> Result<Vec<DatasetVersion>, Box<dyn Error>> {
    let mut versions = Vec::new();
    for path in std::fs::read_dir(dir)?.filter_map(|p| p.ok()) {
        if let Ok(index_path) = IndexPath::try_from(path.path()) {
            versions.push(DatasetVersion {
                date: index_path.date(),
                data: load_dataset_version(&index_path)?,
            });
        }
    }
    // newest version first
    versions.sort_unstable_by(|a, b| b.date.cmp(&a.date));
    Ok(versions)
}

/**
 * Parse a string into a graph with the given base.
 * The graph can be in either Turtle (ttl), RDF/XML or N-Triples format;
 * the parsers are tried in that order and the first one that succeeds wins.
 */
pub(crate) fn parse_graph(rdf: &str, base: Option<Iri<String>>) -> Result<LightGraph, MyError> {
    let parser = TurtleParser { base: base.clone() };
    let source = parser.parse_str(rdf);
    if let Ok(graph) = LightGraph::from_triple_source(source) {
        return Ok(graph);
    }
    let parser = RdfXmlParser { base };
    let source = parser.parse_str(rdf);
    if let Ok(graph) = LightGraph::from_triple_source(source) {
        return Ok(graph);
    }
    let parser = NTriplesParser {};
    let source = parser.parse_str(rdf);
    Ok(LightGraph::from_triple_source(source)?)
}

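// Illustrative fallback (a sketch): this input is valid Turtle, so the first
// parser succeeds; an RDF/XML or N-Triples document would be picked up by one
// of the later attempts instead.
//
//     let ttl = r#"<http://example.org/s> <http://example.org/p> "o" ."#;
//     let graph = parse_graph(ttl, None)?;
//     assert_eq!(graph.triples().count(), 1);
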
/**
 * Load the dataset for a version.
 * This is done by reading all the files in the directory and inserting them
 * in a [`dataset::Dataset`].
 */
fn load_dataset_version(dir: &IndexPath) -> Result<FastDataset, Box<dyn Error>> {
    let mut dataset = FastDataset::new();
    for path in dir.file_paths()? {
        let rdf = std::fs::read_to_string(path)?;
        // the graph name is a URN derived from the file contents
        let urn = blob_to_uri(rdf.as_bytes());
        let base_ref = IriRef::new(urn.clone())?;
        let base: Iri<String> = Iri::new(urn)?;
        let graph = parse_graph(&rdf, Some(base))?;
        for t in graph.triples() {
            let t = t?;
            dataset.insert(t.s(), t.p(), t.o(), Some(base_ref.clone()))?;
        }
    }
    Ok(dataset)
}

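// `PREFIX_RE` matches Turtle prefix declarations such as
// `@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .`,
// capturing the prefix name and the namespace IRI.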
lazy_static! {
    static ref PREFIX_RE: Regex = Regex::new(r"@prefix ([a-zA-Z0-9]+):\s+<(.+)>\s\.").unwrap();
}

/**
 * Collect all `@prefix` declarations found in the RDF files under `dir`,
 * panicking if the same prefix is bound to two different IRIs.
 */
pub(crate) fn load_prefixes<P: AsRef<Path>>(dir: P) -> Result<Vec<PrefixMapPair>, Box<dyn Error>> {
    let mut prefixes: HashMap<String, String> = HashMap::new();
    // Dataset directories
    for path in std::fs::read_dir(dir)?.filter_map(|p| p.ok()) {
        // Dataset version directories
        for path in std::fs::read_dir(&path.path())?.filter_map(|p| p.ok()) {
            if let Ok(index_path) = IndexPath::try_from(path.path()) {
                // RDF files
                for path in index_path.file_paths()? {
                    let rdf = std::fs::read_to_string(path)?;
                    // Find prefixes in RDF files
                    for (_, [prefix, url]) in PREFIX_RE.captures_iter(&rdf).map(|c| c.extract()) {
                        match prefixes.insert(prefix.to_string(), url.to_string()) {
                            None => (),
                            Some(old_url) => {
                                if url != old_url {
                                    // TODO Make this nicer
                                    panic!("Prefix \"{prefix}\" was defined twice with different values");
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    Ok(prefixes
        .into_iter()
        .map(|(prefix, url)| {
            (
                Prefix::new_unchecked(prefix.into()),
                Iri::new_unchecked(url.into()),
            )
        })
        .collect())
}

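// Illustrative call (a sketch; `data` is the same hypothetical layout as in
// `load_datasets`):
//
//     let prefixes = load_prefixes("data")?;
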
pub(crate) fn get_pos_label<'a>(pos: &'a Vec<PredicateObject<'a>>) -> Option<&str> {
    let target = "http://www.w3.org/2000/01/rdf-schema#label";
    find_predicate_object_literals_for_subject(pos, target)
        .first()
        .copied()
}

pub(crate) fn get_pos_comment<'a>(pos: &'a Vec<PredicateObject<'a>>) -> Option<&str> {
    let target = "http://www.w3.org/2000/01/rdf-schema#comment";
    find_predicate_object_literals_for_subject(pos, target)
        .first()
        .copied()
}

pub(crate) fn find_predicate_object_literals_for_subject<'a>(
    pos: &'a Vec<PredicateObject<'a>>,
    target: &'a str,
) -> Vec<&'a str> {
    find_predicate_objects_for_subject(pos, target)
        .into_iter()
        .filter(|po| po.object_literal.is_some())
        .map(|po| po.object_literal.unwrap_or_default())
        .collect()
}

pub(crate) fn find_predicate_objects_for_subject<'a>(
    pos: &'a Vec<PredicateObject<'a>>,
    target: &'a str,
) -> Vec<&'a PredicateObject<'a>> {
    pos.into_iter()
        .filter(|po| po.predicate.as_str() == target)
        .collect()
}
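
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch, not part of the original test suite: build a
    // `PredicateObject` by hand and exercise the rdfs:label / rdfs:comment
    // helpers.
    #[test]
    fn label_helpers_pick_the_literal() {
        let label_iri =
            IriRef::new(MownStr::from("http://www.w3.org/2000/01/rdf-schema#label")).unwrap();
        let pos = vec![PredicateObject {
            predicate: &label_iri,
            object_iri: None,
            object_literal: Some("Example label"),
        }];
        assert_eq!(get_pos_label(&pos), Some("Example label"));
        assert!(get_pos_comment(&pos).is_none());
    }
}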