1
use git2::Repository;
2
use rayon::iter::{IntoParallelIterator, ParallelIterator};
3
use std::{
4
    collections::BTreeSet,
5
    fs::remove_dir_all,
6
    path::{Path, PathBuf},
7
    process::Command,
8
    sync::atomic::{self, AtomicUsize},
9
};
10
use url::Url;
11

            
12
/// Clone git repositories from URLs listed in the input file.
13
20
pub fn harvest(input: &Path, output_dir: &Path) {
14
20
    let urls = read_urls(input);
15
20
    let n = urls.len();
16
20
    let counter = AtomicUsize::new(0);
17
20
    urls.into_par_iter().for_each(|url| {
18
10
        let idx = counter.fetch_add(1, atomic::Ordering::SeqCst) + 1;
19
10
        println!("{}/{} Cloning {}", idx, n, url);
20
10
        clone(output_dir, url);
21
20
    });
22
20
}
23

            
24
// Read a file and try to interpret each line as a url.
25
// Return a set with the valid urls.
26
// If a line is not valid, try again with a https:// prefix
27
20
fn read_urls(input: &Path) -> BTreeSet<Url> {
28
20
    let txt = std::fs::read_to_string(input).unwrap();
29
20
    let mut urls = BTreeSet::new();
30
20
    for line in txt.lines() {
31
10
        if let Ok(url) = Url::parse(line) {
32
10
            urls.insert(url);
33
10
        } else if let Ok(url) = Url::parse(&format!("https://{line}")) {
34
            urls.insert(url);
35
        }
36
    }
37
20
    urls
38
20
}
39

            
40
/// Create a directory path from a url. The directory path is created
41
/// from the URL by taking scheme, domain and path from the URL.
42
///   <https://example.org/group/project>
43
/// becomes
44
///   https/exmple..org/group/project
45
fn dir_from_url(base_dir: &Path, url: &Url) -> Option<PathBuf> {
46
10
    if let Some(domain) = url.domain() {
47
10
        let mut dir = base_dir.join(url.scheme());
48
10
        dir = dir.join(domain);
49
10
        let path = url.path();
50
10
        if let Some(path) = path.strip_prefix('/') {
51
10
            dir = dir.join(path);
52
10
        } else if !path.is_empty() {
53
            dir = dir.join(path);
54
        } else {
55
            return None;
56
        }
57
10
        return Some(dir);
58
    }
59
    None
60
10
}
61

            
62
/// Try to clone a git repository from the given URL into
63
/// the given directory.
64
/// If the directory already exists,
65
/// If the repository ca
66
10
fn clone(base_dir: &Path, url: Url) {
67
10
    let dir = if let Some(dir) = dir_from_url(base_dir, &url) {
68
10
        dir
69
    } else {
70
        return;
71
    };
72
10
    let git_dir = dir.join(".git");
73
10
    if git_dir.exists() {
74
        check_repository(&dir);
75
        return;
76
10
    }
77
10
    let dir_str: String = dir.to_string_lossy().to_string();
78
    // Clone with git executable because cloning with the git2 crate
79
    // gives segfaults on some urls.
80
10
    let output = match Command::new("git")
81
10
        .arg("clone")
82
10
        .arg(url.as_str())
83
10
        .arg(&dir_str)
84
10
        .env("GIT_TERMINAL_PROMPT", "0")
85
10
        .output()
86
    {
87
10
        Ok(output) => output,
88
        Err(_err) => return,
89
    };
90
10
    if !output.status.success() {
91
        eprintln!("Could not clone {} into {}", url, dir.display());
92
        return;
93
10
    }
94
10
    let output = match Command::new("git")
95
10
        .arg("fetch")
96
10
        .arg("--tags")
97
10
        .current_dir(&dir)
98
10
        .output()
99
    {
100
10
        Ok(output) => output,
101
        Err(_err) => return,
102
    };
103
10
    if !output.status.success() {
104
        eprintln!("Could not fetch tags for {}", url);
105
10
    }
106
10
    check_repository(&dir);
107
10
}
108

            
109
/// Check the repository in the directory by opening it
110
/// If the repository cannot be opened, the directory is
111
/// removed.
112
10
fn check_repository(dir: &Path) {
113
10
    let repo = match Repository::open(dir) {
114
10
        Ok(repo) => repo,
115
        Err(e) => {
116
            println!("A {}", e);
117
            let _ = remove_dir_all(dir);
118
            return;
119
        }
120
    };
121
10
    let _head = match repo.head() {
122
10
        Ok(head) => head,
123
        Err(e) => {
124
            println!("B {}", e);
125
            println!("Removing {}", dir.display());
126
            let _ = remove_dir_all(dir);
127
            return;
128
        }
129
    };
130
10
    println!("C");
131
10
}