Lines
0 %
Functions
Branches
100 %
use git2::Repository;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::{
collections::BTreeSet,
fs::remove_dir_all,
path::{Path, PathBuf},
process::Command,
sync::atomic::{self, AtomicUsize},
};
use url::Url;
/// Clone git repositories from URLs listed in the input file.
pub fn harvest(input: &Path, output_dir: &Path) {
let urls = read_urls(input);
let n = urls.len();
let counter = AtomicUsize::new(0);
urls.into_par_iter().for_each(|url| {
let idx = counter.fetch_add(1, atomic::Ordering::SeqCst) + 1;
println!("{}/{} Cloning {}", idx, n, url);
clone(output_dir, url);
});
}
/// Read the file at `input` and try to interpret each line as a URL.
///
/// Each line is first parsed as-is; if that fails, parsing is retried with
/// an `https://` prefix. Returns a set with the valid URLs; because the
/// set is a `BTreeSet`, duplicates collapse into a single entry.
///
/// # Panics
///
/// Panics if `input` cannot be read into a string.
fn read_urls(input: &Path) -> BTreeSet<Url> {
// A missing or unreadable input file is treated as fatal (`unwrap`).
let txt = std::fs::read_to_string(input).unwrap();
let mut urls = BTreeSet::new();
for line in txt.lines() {
if let Ok(url) = Url::parse(line) {
urls.insert(url);
// Fallback for lines that do not parse on their own: retry with an
// explicit `https://` scheme in front.
} else if let Ok(url) = Url::parse(&format!("https://{line}")) {
// NOTE(review): the remainder of this function (the body of the fallback
// branch and the closing braces) is not fully visible here — presumably
// the URL is inserted and `urls` is returned; confirm against the file.
urls
/// Create a directory path from a URL. The directory path is created
/// below `base_dir` by taking scheme, domain and path from the URL.
/// https://example.org/group/project
/// becomes
/// https/example.org/group/project
/// (relative to `base_dir`).
///
/// Yields `None` when the URL has no domain; the visible code also
/// returns `None` when the URL path is empty.
fn dir_from_url(base_dir: &Path, url: &Url) -> Option<PathBuf> {
if let Some(domain) = url.domain() {
let mut dir = base_dir.join(url.scheme());
dir = dir.join(domain);
let path = url.path();
// Drop the leading '/' before joining: joining an absolute path onto a
// `PathBuf` would replace `dir` instead of extending it.
if let Some(path) = path.strip_prefix('/') {
dir = dir.join(path);
} else if !path.is_empty() {
// NOTE(review): the body of this branch (non-empty path without a
// leading '/') is not visible here — confirm against the full file.
} else {
return None;
return Some(dir);
None
/// Try to clone a git repository from the given URL into
/// a directory below `base_dir` (see `dir_from_url` for the layout).
/// If the directory already contains a `.git` entry, the existing
/// repository is checked with `check_repository`.
/// If the repository cannot be cloned, or its tags cannot be fetched,
/// a message is printed to stderr.
/// Nothing is returned: when no directory can be derived from the URL or
/// the `git` executable cannot be run, the function returns silently.
fn clone(base_dir: &Path, url: Url) {
let dir = if let Some(dir) = dir_from_url(base_dir, &url) {
dir
// No target directory could be derived from this URL: nothing to do.
return;
let git_dir = dir.join(".git");
if git_dir.exists() {
check_repository(&dir);
// NOTE(review): the lines following this check are not all visible here;
// whether an existing repository is cloned again needs confirming.
let dir_str: String = dir.to_string_lossy().to_string();
// Clone with git executable because cloning with the git2 crate
// gives segfaults on some urls.
let output = match Command::new("git")
.arg("clone")
.arg(url.as_str())
.arg(&dir_str)
// GIT_TERMINAL_PROMPT=0 asks git not to prompt on the terminal (e.g. for
// credentials), so a clone fails instead of blocking on input.
.env("GIT_TERMINAL_PROMPT", "0")
.output()
{
Ok(output) => output,
// The `git` process could not be run at all; give up on this URL.
Err(_err) => return,
if !output.status.success() {
eprintln!("Could not clone {} into {}", url, dir.display());
// A second command run inside the cloned directory: `fetch --tags`.
// NOTE(review): the start of this command is not visible here —
// presumably another `Command::new("git")`; confirm.
.arg("fetch")
.arg("--tags")
.current_dir(&dir)
eprintln!("Could not fetch tags for {}", url);
/// Check the repository in the directory by opening it
/// with `git2` and looking up its `HEAD`.
/// If the repository cannot be opened, the directory is
/// removed.
/// Short marker messages (`A`, `B`, `C`, `Removing <dir>`) are printed
/// to stdout along the way.
fn check_repository(dir: &Path) {
let repo = match Repository::open(dir) {
Ok(repo) => repo,
Err(e) => {
println!("A {}", e);
// Best effort: a failure to delete the directory is deliberately ignored.
let _ = remove_dir_all(dir);
// The repository opened; additionally try to resolve HEAD.
let _head = match repo.head() {
Ok(head) => head,
// NOTE(review): the error arm around the next lines is not fully visible
// here — presumably the directory is also removed when HEAD cannot be
// resolved; confirm against the full file.
println!("B {}", e);
println!("Removing {}", dir.display());
println!("C");