use git2::Repository;
use nonblock::NonBlockingReader;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use serde::Serialize;
use std::{
    collections::BTreeSet,
    fs::remove_dir_all,
    io::Read,
    path::{Component, Path, PathBuf},
    process::{Child, Command, ExitStatus, Stdio},
    sync::atomic::{self, AtomicUsize},
    thread::sleep,
    time::{Duration, Instant},
};
use url::Url;

#[derive(Serialize)]
18
pub enum GitResultState {
19
    CannotConvertUrlToDirectory,
20
    GitFailed { error: String },
21
    CloningFailed { stdout: String, stderr: String },
22
    FetchFailed { stdout: String, stderr: String },
23
    InvalidCheckout { error: String },
24
    ValidCheckout,
25
}
26

            
27
2
#[derive(Serialize)]
28
pub struct GitResult {
29
    pub state: GitResultState,
30
    pub url: String,
31
}
32

            
33
/// Clone git repositories from URLs listed in the input file.
34
20
pub fn harvest(input: &Path, output_dir: &Path) -> Vec<GitResult> {
35
20
    let urls = read_urls(input);
36
20
    let n = urls.len();
37
20
    let counter = AtomicUsize::new(0);
38
20
    urls.into_par_iter()
39
20
        .map(|url| {
40
10
            let idx = counter.fetch_add(1, atomic::Ordering::SeqCst) + 1;
41
10
            println!("{}/{} Cloning {}", idx, n, url);
42
10
            clone(output_dir, url)
43
20
        })
44
20
        .collect()
45
20
}
46

            
47
// Read a file and try to interpret each line as a url.
48
// Return a set with the valid urls.
49
// If a line is not valid, try again with a https:// prefix
50
20
fn read_urls(input: &Path) -> BTreeSet<Url> {
51
20
    let txt = std::fs::read_to_string(input).unwrap();
52
20
    let mut urls = BTreeSet::new();
53
20
    for line in txt.lines() {
54
10
        if let Ok(url) = Url::parse(line) {
55
10
            urls.insert(url);
56
10
        } else if let Ok(url) = Url::parse(&format!("https://{line}")) {
57
            urls.insert(url);
58
        }
59
    }
60
20
    urls
61
20
}
62

            
63
/// Create a directory path from a url. The directory path is created
64
/// from the URL by taking scheme, domain and path from the URL.
65
///   <https://example.org/group/project>
66
/// becomes
67
///   https/exmple..org/group/project
68
fn dir_from_url(base_dir: &Path, url: &Url) -> Option<PathBuf> {
69
10
    if let Some(domain) = url.domain() {
70
10
        let mut dir = base_dir.join(url.scheme());
71
10
        dir = dir.join(domain);
72
10
        let path = url.path();
73
10
        if let Some(path) = path.strip_prefix('/') {
74
10
            dir = dir.join(path);
75
10
        } else if !path.is_empty() {
76
            dir = dir.join(path);
77
        } else {
78
            return None;
79
        }
80
10
        return Some(dir);
81
    }
82
    None
83
10
}
84

            
85
/// Exit status and captured output of a finished child process.
struct GitOutput {
    /// Status the process exited with.
    status: ExitStatus,
    /// Bytes the process wrote to its standard output.
    stdout: Vec<u8>,
    /// Bytes the process wrote to its standard error.
    stderr: Vec<u8>,
}
90

            
91
/// run a process, but abort it when there is a period of inactivity
92
20
fn run_with_timeout(cmd: &mut Command, timeout: Duration) -> std::io::Result<GitOutput> {
93
20
    let mut child = cmd
94
20
        .stdin(Stdio::null())
95
20
        .stdout(Stdio::piped())
96
20
        .stderr(Stdio::piped())
97
20
        .spawn()?;
98
20
    let mut now = Instant::now();
99
20
    let out = child.stdout.take().expect("Failed to take stdout.");
100
20
    let mut out = NonBlockingReader::from_fd(out)?;
101
20
    let err = child.stderr.take().expect("Failed to take stderr.");
102
20
    let mut err = NonBlockingReader::from_fd(err)?;
103
20
    let mut stdout = Vec::with_capacity(1024);
104
20
    let mut stderr = Vec::with_capacity(1024);
105
20
    let status = loop {
106
810
        match child.try_wait()? {
107
            None => {
108
                // when new data can be read, reset the clock
109
790
                if out.read_available(&mut stdout)? > 0 || err.read_available(&mut stderr)? > 0 {
110
10
                    now = Instant::now();
111
780
                }
112
790
                if now.elapsed() > timeout {
113
                    child.kill()?;
114
                    child.wait()?;
115
                    return Err(std::io::ErrorKind::TimedOut.into());
116
790
                }
117
790
                sleep(Duration::from_millis(100));
118
            }
119
20
            Some(exit_status) => break exit_status,
120
20
        }
121
20
    };
122
20
    out.into_blocking()?.read_to_end(&mut stdout)?;
123
20
    err.into_blocking()?.read_to_end(&mut stderr)?;
124
20
    Ok(GitOutput {
125
20
        status,
126
20
        stdout,
127
20
        stderr,
128
20
    })
129
20
}
130

            
131
/// Try to clone a git repository from the given URL into
132
/// the given directory.
133
/// If the directory already exists, it is checked.
134
10
fn clone(base_dir: &Path, url: Url) -> GitResult {
135
10
    let dir = if let Some(dir) = dir_from_url(base_dir, &url) {
136
10
        dir
137
    } else {
138
        return GitResult {
139
            state: GitResultState::CannotConvertUrlToDirectory,
140
            url: url.into(),
141
        };
142
    };
143
10
    let git_dir = dir.join(".git");
144
10
    if git_dir.exists() {
145
        return check_repository(&dir, &url);
146
10
    }
147
10
    let dir_str: String = dir.to_string_lossy().to_string();
148
    // Clone with git executable because cloning with the git2 crate
149
    // gives segfaults on some urls.
150
10
    let output = match run_with_timeout(
151
10
        Command::new("git")
152
10
            .arg("clone")
153
10
            .arg(url.as_str())
154
10
            .arg(&dir_str)
155
10
            .env("GIT_TERMINAL_PROMPT", "0"),
156
10
        Duration::from_secs(30),
157
10
    ) {
158
10
        Ok(output) => output,
159
        Err(err) => {
160
            let _ = remove_dir_all(dir);
161
            return GitResult {
162
                state: GitResultState::GitFailed {
163
                    error: err.to_string(),
164
                },
165
                url: url.into(),
166
            }
167
        }
168
    };
169
10
    if !output.status.success() {
170
        return GitResult {
171
            state: GitResultState::CloningFailed {
172
                stdout: String::from_utf8_lossy(&output.stdout).into(),
173
                stderr: String::from_utf8_lossy(&output.stderr).into(),
174
            },
175
            url: url.into(),
176
        };
177
10
    }
178
10
    let output = match run_with_timeout(
179
10
        Command::new("git")
180
10
            .arg("fetch")
181
10
            .arg("--tags")
182
10
            .current_dir(&dir),
183
10
        Duration::from_secs(30),
184
10
    ) {
185
10
        Ok(output) => output,
186
        Err(err) => {
187
            return GitResult {
188
                state: GitResultState::GitFailed {
189
                    error: err.to_string(),
190
                },
191
                url: url.into(),
192
            }
193
        }
194
    };
195
10
    if !output.status.success() {
196
        // could not fetch tags
197
        return GitResult {
198
            state: GitResultState::FetchFailed {
199
                stdout: String::from_utf8_lossy(&output.stdout).into(),
200
                stderr: String::from_utf8_lossy(&output.stderr).into(),
201
            },
202
            url: url.into(),
203
        };
204
10
    }
205
10
    check_repository(&dir, &url)
206
10
}
207

            
208
/// Check the repository in the directory by opening it
209
/// If the repository cannot be opened, the directory is
210
/// removed.
211
10
fn check_repository(dir: &Path, url: &Url) -> GitResult {
212
10
    let repo = match Repository::open(dir) {
213
10
        Ok(repo) => repo,
214
        Err(e) => {
215
            let _ = remove_dir_all(dir);
216
            return GitResult {
217
                state: GitResultState::InvalidCheckout {
218
                    error: e.to_string(),
219
                },
220
                url: url.to_string(),
221
            };
222
        }
223
    };
224
10
    let _head = match repo.head() {
225
10
        Ok(head) => head,
226
        Err(e) => {
227
            let _ = remove_dir_all(dir);
228
            return GitResult {
229
                state: GitResultState::InvalidCheckout {
230
                    error: e.to_string(),
231
                },
232
                url: url.to_string(),
233
            };
234
        }
235
    };
236
10
    GitResult {
237
10
        state: GitResultState::ValidCheckout,
238
10
        url: url.to_string(),
239
10
    }
240
10
}