1
use crate::analyze::{analyze_repository, RepositoryAnalysis};
2
use git2::Repository;
3
use nonblock::NonBlockingReader;
4
use rayon::iter::{IntoParallelIterator, ParallelIterator};
5
use sequoia_openpgp::Cert;
6
use serde::Serialize;
7
use std::{
8
    collections::BTreeSet,
9
    fs::remove_dir_all,
10
    io::Read,
11
    path::{Path, PathBuf},
12
    process::{Command, ExitStatus, Stdio},
13
    sync::atomic::{self, AtomicUsize},
14
    thread::sleep,
15
    time::{Duration, Instant},
16
};
17
use url::Url;
18

            
19
2
#[derive(Serialize, PartialEq)]
20
pub enum GitResultState {
21
    CannotConvertUrlToDirectory,
22
    GitFailed { error: String },
23
    CloningFailed { stdout: String, stderr: String },
24
    FetchFailed { stdout: String, stderr: String },
25
    InvalidCheckout { error: String },
26
    ValidCheckout { analysis: RepositoryAnalysis },
27
}
28

            
29
2
#[derive(Serialize)]
30
pub struct GitResult {
31
    pub state: GitResultState,
32
    pub url: String,
33
}
34

            
35
/// Clone git repositories from URLs listed in the input file.
36
20
pub fn harvest(input: &Path, output_dir: &Path, certs: &[Cert]) -> Vec<GitResult> {
37
20
    let urls = read_urls(input);
38
20
    let n = urls.len();
39
20
    let counter = AtomicUsize::new(0);
40
20
    urls.into_par_iter()
41
20
        .map(|url| {
42
10
            let idx = counter.fetch_add(1, atomic::Ordering::SeqCst) + 1;
43
10
            println!("{}/{} Cloning {}", idx, n, url);
44
10
            clone(output_dir, url, certs)
45
20
        })
46
20
        .collect()
47
20
}
48

            
49
// Read a file and try to interpret each line as a url.
50
// Return a set with the valid urls.
51
// If a line is not valid, try again with a https:// prefix
52
20
fn read_urls(input: &Path) -> BTreeSet<Url> {
53
20
    let txt = std::fs::read_to_string(input).unwrap();
54
20
    let mut urls = BTreeSet::new();
55
20
    for line in txt.lines() {
56
10
        if let Ok(url) = Url::parse(line) {
57
10
            urls.insert(url);
58
10
        } else if let Ok(url) = Url::parse(&format!("https://{line}")) {
59
            urls.insert(url);
60
        }
61
    }
62
20
    urls
63
20
}
64

            
65
/// Create a directory path from a url. The directory path is created
66
/// from the URL by taking scheme, domain and path from the URL.
67
///   <https://example.org/group/project>
68
/// becomes
69
///   https/exmple..org/group/project
70
pub fn dir_from_url(base_dir: &Path, url: &Url) -> Option<PathBuf> {
71
10
    if let Some(domain) = url.domain() {
72
10
        let mut dir = base_dir.join(url.scheme());
73
10
        dir = dir.join(domain);
74
10
        let path = url.path();
75
10
        if let Some(path) = path.strip_prefix('/') {
76
10
            dir = dir.join(path);
77
10
        } else if !path.is_empty() {
78
            dir = dir.join(path);
79
        } else {
80
            return None;
81
        }
82
10
        return Some(dir);
83
    }
84
    None
85
10
}
86

            
87
/// Exit status and captured output of a finished child process.
struct GitOutput {
    /// Exit status reported for the process.
    status: ExitStatus,
    /// Bytes the process wrote to stdout.
    stdout: Vec<u8>,
    /// Bytes the process wrote to stderr.
    stderr: Vec<u8>,
}
92

            
93
/// run a process, but abort it when there is a period of inactivity
94
20
fn run_with_timeout(cmd: &mut Command, timeout: Duration) -> std::io::Result<GitOutput> {
95
20
    let mut child = cmd
96
20
        .stdin(Stdio::null())
97
20
        .stdout(Stdio::piped())
98
20
        .stderr(Stdio::piped())
99
20
        .spawn()?;
100
20
    let mut now = Instant::now();
101
20
    let out = child.stdout.take().expect("Failed to take stdout.");
102
20
    let mut out = NonBlockingReader::from_fd(out)?;
103
20
    let err = child.stderr.take().expect("Failed to take stderr.");
104
20
    let mut err = NonBlockingReader::from_fd(err)?;
105
20
    let mut stdout = Vec::with_capacity(1024);
106
20
    let mut stderr = Vec::with_capacity(1024);
107
20
    let status = loop {
108
50
        match child.try_wait()? {
109
            None => {
110
                // when new data can be read, reset the clock
111
30
                if out.read_available(&mut stdout)? > 0 || err.read_available(&mut stderr)? > 0 {
112
10
                    now = Instant::now();
113
20
                }
114
30
                if now.elapsed() > timeout {
115
                    child.kill()?;
116
                    child.wait()?;
117
                    return Err(std::io::ErrorKind::TimedOut.into());
118
30
                }
119
30
                sleep(Duration::from_millis(100));
120
            }
121
20
            Some(exit_status) => break exit_status,
122
20
        }
123
20
    };
124
20
    out.into_blocking()?.read_to_end(&mut stdout)?;
125
20
    err.into_blocking()?.read_to_end(&mut stderr)?;
126
20
    Ok(GitOutput {
127
20
        status,
128
20
        stdout,
129
20
        stderr,
130
20
    })
131
20
}
132

            
133
/// Try to clone a git repository from the given URL into
134
/// the given directory.
135
/// If the directory already exists, it is checked.
136
10
fn clone(base_dir: &Path, url: Url, certs: &[Cert]) -> GitResult {
137
10
    let dir = if let Some(dir) = dir_from_url(base_dir, &url) {
138
10
        dir
139
    } else {
140
        return GitResult {
141
            state: GitResultState::CannotConvertUrlToDirectory,
142
            url: url.into(),
143
        };
144
    };
145
10
    let git_dir = dir.join(".git");
146
10
    if git_dir.exists() {
147
        return check_repository(&dir, &url, certs);
148
10
    }
149
10
    let dir_str: String = dir.to_string_lossy().to_string();
150
    // Clone with git executable because cloning with the git2 crate
151
    // gives segfaults on some urls.
152
10
    let output = match run_with_timeout(
153
10
        Command::new("git")
154
10
            .arg("clone")
155
10
            .arg(url.as_str())
156
10
            .arg(&dir_str)
157
10
            .env("GIT_TERMINAL_PROMPT", "0"),
158
10
        Duration::from_secs(30),
159
10
    ) {
160
10
        Ok(output) => output,
161
        Err(err) => {
162
            let _ = remove_dir_all(dir);
163
            return GitResult {
164
                state: GitResultState::GitFailed {
165
                    error: err.to_string(),
166
                },
167
                url: url.into(),
168
            };
169
        }
170
    };
171
10
    if !output.status.success() {
172
        return GitResult {
173
            state: GitResultState::CloningFailed {
174
                stdout: String::from_utf8_lossy(&output.stdout).into(),
175
                stderr: String::from_utf8_lossy(&output.stderr).into(),
176
            },
177
            url: url.into(),
178
        };
179
10
    }
180
10
    let output = match run_with_timeout(
181
10
        Command::new("git")
182
10
            .arg("fetch")
183
10
            .arg("--tags")
184
10
            .current_dir(&dir),
185
10
        Duration::from_secs(30),
186
10
    ) {
187
10
        Ok(output) => output,
188
        Err(err) => {
189
            return GitResult {
190
                state: GitResultState::GitFailed {
191
                    error: err.to_string(),
192
                },
193
                url: url.into(),
194
            }
195
        }
196
    };
197
10
    if !output.status.success() {
198
        // could not fetch tags
199
        return GitResult {
200
            state: GitResultState::FetchFailed {
201
                stdout: String::from_utf8_lossy(&output.stdout).into(),
202
                stderr: String::from_utf8_lossy(&output.stderr).into(),
203
            },
204
            url: url.into(),
205
        };
206
10
    }
207
10
    check_repository(&dir, &url, certs)
208
10
}
209

            
210
/// Check the repository in the directory by opening it
211
/// If the repository cannot be opened, the directory is
212
/// removed.
213
10
fn check_repository(dir: &Path, url: &Url, certs: &[Cert]) -> GitResult {
214
10
    let repo = match Repository::open(dir) {
215
10
        Ok(repo) => repo,
216
        Err(e) => {
217
            let _ = remove_dir_all(dir);
218
            return GitResult {
219
                state: GitResultState::InvalidCheckout {
220
                    error: e.to_string(),
221
                },
222
                url: url.to_string(),
223
            };
224
        }
225
    };
226
10
    let _head = match repo.head() {
227
10
        Ok(head) => head,
228
        Err(e) => {
229
            let _ = remove_dir_all(dir);
230
            return GitResult {
231
                state: GitResultState::InvalidCheckout {
232
                    error: e.to_string(),
233
                },
234
                url: url.to_string(),
235
            };
236
        }
237
    };
238
10
    let analysis = match analyze_repository(certs, &repo) {
239
10
        Ok(analysis) => analysis,
240
        Err(e) => {
241
            let _ = remove_dir_all(dir);
242
            return GitResult {
243
                state: GitResultState::InvalidCheckout {
244
                    error: e.to_string(),
245
                },
246
                url: url.to_string(),
247
            };
248
        }
249
    };
250
10
    GitResult {
251
10
        state: GitResultState::ValidCheckout { analysis },
252
10
        url: url.to_string(),
253
10
    }
254
10
}