1
use git2::Repository;
2
use rayon::iter::{IntoParallelIterator, ParallelIterator};
3
use std::{
4
    collections::BTreeSet,
5
    fs::remove_dir_all,
6
    path::{Path, PathBuf},
7
    process::Command,
8
    sync::atomic::{self, AtomicUsize},
9
};
10
use url::Url;
11

            
12
/// Clone git repositories from URLs listed in the input file.
13
pub fn harvest(input: &Path, output_dir: &Path) {
14
    let urls = read_urls(input);
15
    let n = urls.len();
16
    let counter = AtomicUsize::new(0);
17
    urls.into_par_iter().for_each(|url| {
18
        let idx = counter.fetch_add(1, atomic::Ordering::SeqCst) + 1;
19
        println!("{}/{} Cloning {}", idx, n, url);
20
        clone(output_dir, url);
21
    });
22
}
23

            
24
// Read a file and try to interpret each line as a url.
25
// Return a set with the valid urls.
26
// If a line is not valid, try again with a https:// prefix
27
fn read_urls(input: &Path) -> BTreeSet<Url> {
28
    let txt = std::fs::read_to_string(input).unwrap();
29
    let mut urls = BTreeSet::new();
30
    for line in txt.lines() {
31
        if let Ok(url) = Url::parse(line) {
32
            urls.insert(url);
33
        } else if let Ok(url) = Url::parse(&format!("https://{line}")) {
34
            urls.insert(url);
35
        }
36
    }
37
    urls
38
}
39

            
40
/// Create a directory path from a url. The directory path is created
41
/// from the URL by taking scheme, domain and path from the URL.
42
///   https://example.org/group/project
43
/// becomes
44
///   https/exmple..org/group/project
45
fn dir_from_url(base_dir: &Path, url: &Url) -> Option<PathBuf> {
46
    if let Some(domain) = url.domain() {
47
        let mut dir = base_dir.join(url.scheme());
48
        dir = dir.join(domain);
49
        let path = url.path();
50
        if let Some(path) = path.strip_prefix('/') {
51
            dir = dir.join(path);
52
        } else if !path.is_empty() {
53
            dir = dir.join(path);
54
        } else {
55
            return None;
56
        }
57
        return Some(dir);
58
    }
59
    None
60
}
61

            
62
/// Try to clone a git repository from the given URL into
63
/// the given directory.
64
/// If the directory already exists,
65
/// If the repository ca
66
fn clone(base_dir: &Path, url: Url) {
67
    let dir = if let Some(dir) = dir_from_url(base_dir, &url) {
68
        dir
69
    } else {
70
        return;
71
    };
72
    let git_dir = dir.join(".git");
73
    if git_dir.exists() {
74
        check_repository(&dir);
75
        return;
76
    }
77
    let dir_str: String = dir.to_string_lossy().to_string();
78
    // Clone with git executable because cloning with the git2 crate
79
    // gives segfaults on some urls.
80
    let output = match Command::new("git")
81
        .arg("clone")
82
        .arg(url.as_str())
83
        .arg(&dir_str)
84
        .env("GIT_TERMINAL_PROMPT", "0")
85
        .output()
86
    {
87
        Ok(output) => output,
88
        Err(_err) => return,
89
    };
90
    if !output.status.success() {
91
        eprintln!("Could not clone {} into {}", url, dir.display());
92
        return;
93
    }
94
    let output = match Command::new("git")
95
        .arg("fetch")
96
        .arg("--tags")
97
        .current_dir(&dir)
98
        .output()
99
    {
100
        Ok(output) => output,
101
        Err(_err) => return,
102
    };
103
    if !output.status.success() {
104
        eprintln!("Could not fetch tags for {}", url);
105
    }
106
    check_repository(&dir);
107
}
108

            
109
/// Check the repository in the directory by opening it
110
/// If the repository cannot be opened, the directory is
111
/// removed.
112
fn check_repository(dir: &Path) {
113
    let repo = match Repository::open(dir) {
114
        Ok(repo) => repo,
115
        Err(e) => {
116
            println!("A {}", e);
117
            let _ = remove_dir_all(dir);
118
            return;
119
        }
120
    };
121
    let _head = match repo.head() {
122
        Ok(head) => head,
123
        Err(e) => {
124
            println!("B {}", e);
125
            println!("Removing {}", dir.display());
126
            let _ = remove_dir_all(dir);
127
            return;
128
        }
129
    };
130
    println!("C");
131
}