Lines
60.92 %
Functions
39.13 %
use crate::analyze::{analyze_repository, RepositoryAnalysis};
use git2::Repository;
use nonblock::NonBlockingReader;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use sequoia_openpgp::Cert;
use serde::Serialize;
use std::{
collections::BTreeSet,
fs::remove_dir_all,
io::Read,
path::{Path, PathBuf},
process::{Command, ExitStatus, Stdio},
sync::atomic::{self, AtomicUsize},
thread::sleep,
time::{Duration, Instant},
};
use url::Url;
/// Outcome of processing one repository URL; stored in [`GitResult::state`].
#[derive(Serialize, PartialEq)]
pub enum GitResultState {
/// No local directory path could be derived from the URL.
CannotConvertUrlToDirectory,
/// Running the `git` executable returned an I/O error (this includes the
/// inactivity timeout); `error` is the error's text.
GitFailed { error: String },
/// `git clone` exited with a non-success status; carries its captured
/// output, lossily decoded as UTF-8.
CloningFailed { stdout: String, stderr: String },
/// The follow-up fetch of tags did not succeed.
FetchFailed { stdout: String, stderr: String },
/// The checkout could not be opened as a repository; `error` is the error's text.
/// NOTE(review): possibly also used for other check failures - confirm.
InvalidCheckout { error: String },
/// The checkout was opened and analyzed; `analysis` is the analysis result.
ValidCheckout { analysis: RepositoryAnalysis },
}
/// Result of processing a single repository URL.
#[derive(Serialize)]
pub struct GitResult {
/// What happened while cloning or checking the repository.
pub state: GitResultState,
/// The repository URL this result belongs to, as a string.
pub url: String,
/// Clone git repositories from URLs listed in the input file.
///
/// The URLs are read from `input` with `read_urls` and processed in
/// parallel. A progress line of the form `idx/n Cloning url` is printed
/// for each URL, and one [`GitResult`] per URL is collected into the
/// returned vector. `output_dir` and `certs` are passed on to `clone`.
pub fn harvest(input: &Path, output_dir: &Path, certs: &[Cert]) -> Vec<GitResult> {
let urls = read_urls(input);
let n = urls.len();
// Shared between the parallel workers; only used to number the progress lines.
let counter = AtomicUsize::new(0);
urls.into_par_iter()
.map(|url| {
// `fetch_add` returns the previous value, hence the `+ 1` for a 1-based index.
let idx = counter.fetch_add(1, atomic::Ordering::SeqCst) + 1;
println!("{}/{} Cloning {}", idx, n, url);
clone(output_dir, url, certs)
})
.collect()
/// Read a file and try to interpret each line as a URL.
///
/// Returns the set of valid URLs. If a line does not parse as a URL as-is,
/// parsing is retried with an `https://` prefix.
///
/// # Panics
///
/// Panics if `input` cannot be read into a string.
fn read_urls(input: &Path) -> BTreeSet<Url> {
let txt = std::fs::read_to_string(input).unwrap();
let mut urls = BTreeSet::new();
for line in txt.lines() {
if let Ok(url) = Url::parse(line) {
urls.insert(url);
} else if let Ok(url) = Url::parse(&format!("https://{line}")) {
// NOTE(review): the body of this branch (presumably inserting the prefixed
// URL) and the closing braces are not present in this listing - confirm
// against the full file.
urls
/// Create a directory path from a url. The directory path is created
/// from the URL by taking scheme, domain and path from the URL.
/// <https://example.org/group/project>
/// becomes
/// https/example.org/group/project
/// below `base_dir`.
///
/// Returns `None` if the URL has no domain or if its path is empty.
pub fn dir_from_url(base_dir: &Path, url: &Url) -> Option<PathBuf> {
if let Some(domain) = url.domain() {
let mut dir = base_dir.join(url.scheme());
dir = dir.join(domain);
let path = url.path();
// Drop the leading '/' before joining: joining an absolute path would
// replace `dir` instead of extending it.
if let Some(path) = path.strip_prefix('/') {
dir = dir.join(path);
} else if !path.is_empty() {
// NOTE(review): the body of this branch (non-empty path without a leading
// '/') is not present in this listing - confirm against the full file.
} else {
// Empty path: there is nothing to build a directory name from.
return None;
return Some(dir);
// Reached when the URL has no domain.
None
/// Exit status and captured output of a finished child process,
/// as returned by `run_with_timeout`.
struct GitOutput {
/// Exit status of the child process.
status: ExitStatus,
/// Raw bytes read from the child's stdout.
stdout: Vec<u8>,
/// Raw bytes read from the child's stderr.
stderr: Vec<u8>,
/// Run a process, but abort it when there is a period of inactivity.
///
/// The child is spawned with stdin set to null and stdout/stderr piped.
/// While the child is running, its output is polled every 100 ms; each time
/// new output is read the inactivity clock is reset. If no output has been
/// seen for longer than `timeout`, the child is killed and waited for.
///
/// # Errors
///
/// Returns an error of kind `TimedOut` when the inactivity timeout is hit,
/// and propagates any I/O error from spawning, polling, reading from or
/// killing the child.
fn run_with_timeout(cmd: &mut Command, timeout: Duration) -> std::io::Result<GitOutput> {
let mut child = cmd
.stdin(Stdio::null())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()?;
// Despite its name, `now` holds the time of the last observed activity.
let mut now = Instant::now();
let out = child.stdout.take().expect("Failed to take stdout.");
let mut out = NonBlockingReader::from_fd(out)?;
let err = child.stderr.take().expect("Failed to take stderr.");
let mut err = NonBlockingReader::from_fd(err)?;
let mut stdout = Vec::with_capacity(1024);
let mut stderr = Vec::with_capacity(1024);
let status = loop {
match child.try_wait()? {
// The child is still running.
None => {
// when new data can be read, reset the clock
// `||` short-circuits: stderr is not polled in an iteration in which
// stdout already produced data.
if out.read_available(&mut stdout)? > 0 || err.read_available(&mut stderr)? > 0 {
now = Instant::now();
if now.elapsed() > timeout {
// Kill the child, then wait for it so that it is reaped.
child.kill()?;
child.wait()?;
return Err(std::io::ErrorKind::TimedOut.into());
sleep(Duration::from_millis(100));
Some(exit_status) => break exit_status,
// The child has exited: read whatever output is still left in the pipes.
out.into_blocking()?.read_to_end(&mut stdout)?;
err.into_blocking()?.read_to_end(&mut stderr)?;
Ok(GitOutput {
status,
stdout,
stderr,
/// Try to clone a git repository from the given URL into
/// the given directory.
/// If the directory already exists, it is checked.
///
/// The target directory is derived from `base_dir` and `url` with
/// `dir_from_url`. If it already contains a `.git` entry, no clone is
/// attempted and the existing checkout is passed to `check_repository`.
/// Otherwise `git clone` is run with an inactivity timeout of 30 seconds,
/// followed by a fetch of tags inside the new checkout, and the result is
/// passed to `check_repository`.
///
/// NOTE(review): several lines of this function (`else` branches,
/// `return GitResult {` openers, closing braces and the start of the fetch
/// invocation) are not present in this listing - confirm details against
/// the full file.
fn clone(base_dir: &Path, url: Url, certs: &[Cert]) -> GitResult {
let dir = if let Some(dir) = dir_from_url(base_dir, &url) {
dir
// No directory could be derived from the URL.
return GitResult {
state: GitResultState::CannotConvertUrlToDirectory,
url: url.into(),
let git_dir = dir.join(".git");
// An existing checkout is only checked, not cloned again.
if git_dir.exists() {
return check_repository(&dir, &url, certs);
let dir_str: String = dir.to_string_lossy().to_string();
// Clone with git executable because cloning with the git2 crate
// gives segfaults on some urls.
let output = match run_with_timeout(
Command::new("git")
.arg("clone")
.arg(url.as_str())
.arg(&dir_str)
// Per the git documentation, the value 0 stops git from prompting on the
// terminal (e.g. for credentials).
.env("GIT_TERMINAL_PROMPT", "0"),
Duration::from_secs(30),
) {
Ok(output) => output,
Err(err) => {
// Best-effort removal of whatever was cloned so far; a failure to remove
// is deliberately ignored.
let _ = remove_dir_all(dir);
state: GitResultState::GitFailed {
error: err.to_string(),
},
// git ran to completion but reported failure.
if !output.status.success() {
state: GitResultState::CloningFailed {
stdout: String::from_utf8_lossy(&output.stdout).into(),
stderr: String::from_utf8_lossy(&output.stderr).into(),
// Second git invocation: fetch tags, run inside the cloned directory.
.arg("fetch")
.arg("--tags")
.current_dir(&dir),
// could not fetch tags
state: GitResultState::FetchFailed {
check_repository(&dir, &url, certs)
/// Check the repository in the directory by opening it
/// If the repository cannot be opened, the directory is
/// removed.
///
/// After opening, the repository's HEAD is looked up and the repository is
/// analyzed with `analyze_repository` using `certs`. On success the result
/// has state `ValidCheckout` carrying the analysis; a failure to open yields
/// `InvalidCheckout` with the error text.
///
/// NOTE(review): the error arms for the HEAD lookup and the analysis, the
/// directory removal and the closing braces are not present in this
/// listing - confirm against the full file.
fn check_repository(dir: &Path, url: &Url, certs: &[Cert]) -> GitResult {
let repo = match Repository::open(dir) {
Ok(repo) => repo,
Err(e) => {
state: GitResultState::InvalidCheckout {
error: e.to_string(),
url: url.to_string(),
// The HEAD value itself is not used afterwards (hence `_head`);
// presumably the lookup only verifies that HEAD exists - confirm.
let _head = match repo.head() {
Ok(head) => head,
let analysis = match analyze_repository(certs, &repo) {
Ok(analysis) => analysis,
GitResult {
state: GitResultState::ValidCheckout { analysis },