diff --git a/.gitignore b/.gitignore index a15a8d1a..8eca0f54 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,8 @@ /.coverage /.tox /dist + + +# Added by cargo + +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..258e507e --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,108 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "identify" +version = "0.0.1" +dependencies = [ + "phf", +] + +[[package]] +name = "phf" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92aacdc5f16768709a569e913f7451034034178b05bdc8acda226659a3dccc66" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676" +dependencies = [ + "siphasher", +] + +[[package]] +name = "proc-macro2" +version = "1.0.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +dependencies = [ + "proc-macro2", 
+]
+
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
+[[package]]
+name = "siphasher"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
+
+[[package]]
+name = "syn"
+version = "1.0.107"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 00000000..db5e0e86
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "identify"
+version = "0.0.1"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+phf = { "version" = "0.11.1", "features" = ["macros"] }
diff --git a/build.rs b/build.rs
new file mode 100644
index 00000000..0a388709
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,107 @@
+use std::collections::HashMap;
+use std::collections::HashSet;
+use std::env;
+use std::fs;
+use std::path::Path;
+
+/// Maps a key scraped from the Python sources to its set of tags.
+type Dict = HashMap<String, HashSet<String>>;
+
+/// Python files the tables are scraped from, in the order they are read.
+const PYTHON_SOURCES: [&str; 2] = ["identify/extensions.py", "identify/interpreters.py"];
+
+/// Writes `map` to `filename` as a single `phf_map!(...)` expression whose
+/// values are `phf_set!(...)` expressions.
+///
+/// Keys and tags are written in sorted order: `HashMap`/`HashSet` iteration
+/// order varies between runs, and sorting keeps the generated file stable.
+///
+/// # Panics
+///
+/// Panics if `filename` cannot be written.
+fn serialize_map(map: Dict, filename: &Path) {
+    let mut entries: Vec<(&String, &HashSet<String>)> = map.iter().collect();
+    entries.sort_by(|a, b| a.0.cmp(b.0));
+
+    let mut out = String::from("phf_map!(\n");
+    for (ext, tags) in entries {
+        let mut tags: Vec<&String> = tags.iter().collect();
+        tags.sort();
+
+        out.push_str(&format!(r#"    "{ext}" => phf_set!("#));
+        for tag in tags {
+            out.push_str(&format!(r#""{tag}", "#));
+        }
+        out.push_str("),\n");
+    }
+    out.push(')');
+
+    fs::write(filename, out)
+        .unwrap_or_else(|err| panic!("failed to write {}: {err}", filename.display()));
+}
+
+fn main() {
+    // Scrape the dicts defined in identify/{extensions,interpreters}.py
+    // into hashmaps and write each one to `$OUT_DIR/<dict name>.rs`.
+
+    let mut extensions: Dict = HashMap::new();
+    let mut extensions_need_binary_check: Dict = HashMap::new();
+    let mut names: Dict = HashMap::new();
+    let mut interpreters: Dict = HashMap::new();
+    let mut current_dict = String::new();
+
+    // Concatenate the Python sources. Each one is declared as a build input
+    // so Cargo reruns this script when it changes; without that, only edits
+    // to build.rs itself would regenerate the tables.
+    let mut python = String::new();
+    for source in PYTHON_SOURCES {
+        println!("cargo:rerun-if-changed={source}");
+        let text = fs::read_to_string(source)
+            .unwrap_or_else(|err| panic!("failed to read {source}: {err}"));
+        python.push_str(&text);
+    }
+
+    // read the dicts into hashmaps
+    for line in python.lines() {
+        if let Some((dict_name, _)) = line.split_once('=') {
+            // A line containing `=` starts a new dict; the text before the
+            // `=` is its name.
+            current_dict = dict_name.trim().into();
+        } else if let Some((ext, tags)) = line.split_once(':') {
+            // A line containing `:` is an entry: the key is on the left and
+            // a comma-separated list of tags on the right. Single quotes are
+            // stripped from the key; quotes and braces from each tag.
+            let ext = ext.trim().replace('\'', "");
+            let tags: HashSet<String> = tags
+                .trim()
+                .split(',')
+                .map(|tag| tag.trim().replace(|c| "'{}".contains(c), ""))
+                .filter(|tag| !tag.is_empty())
+                .collect();
+
+            match current_dict.as_str() {
+                "EXTENSIONS" => extensions.insert(ext, tags),
+                "EXTENSIONS_NEED_BINARY_CHECK" => {
+                    extensions_need_binary_check.insert(ext, tags)
+                }
+                "NAMES" => names.insert(ext, tags),
+                "INTERPRETERS" => interpreters.insert(ext, tags),
+                _ => panic!("Unexpected dict name: {current_dict}"),
+            };
+        }
+    }
+
+    // write them into a rust file
+    let out_dir = env::var_os("OUT_DIR").expect("Cargo sets OUT_DIR for build scripts");
+
+    let extensions_rs = Path::new(&out_dir).join("extensions.rs");
+    let enbc_rs = Path::new(&out_dir).join("extensions_need_binary_check.rs");
+    let names_rs = Path::new(&out_dir).join("names.rs");
+    let interpreters_rs = Path::new(&out_dir).join("interpreters.rs");
+    serialize_map(extensions, &extensions_rs);
+    serialize_map(extensions_need_binary_check, &enbc_rs);
+    serialize_map(names,
&names_rs);
+    serialize_map(interpreters, &interpreters_rs);
+
+    println!("cargo:rerun-if-changed=build.rs");
+}
diff --git a/src/identify.rs b/src/identify.rs
new file mode 100644
index 00000000..cd6fc6d7
--- /dev/null
+++ b/src/identify.rs
@@ -0,0 +1,125 @@
+// Scaffolding: much of this module is still stubbed out, hence the blanket allows.
+#![allow(dead_code)]
+#![allow(unused_imports)]
+#![allow(unused_variables)]
+#![allow(unused_mut)]
+
+use std::collections::HashMap;
+use std::collections::HashSet;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::fs;
+use std::os::unix::fs::FileTypeExt; // For `file_type.is_socket()`
+// use std::os::unix::fs::PermissionsExt; // fs::Permissions `mode()`
+use std::os::unix::fs::MetadataExt; // For `metadata.mode()`
+use std::path::Path;
+
+use crate::tags;
+
+/// File-type tags reported by [`tags_from_path`].
+#[derive(Debug, Eq, Hash, PartialEq)]
+pub enum Tags {
+    Directory,
+    Symlink,
+    Socket,
+    File,
+    Executable,
+    NonExecutable,
+    Text,
+    Binary,
+}
+
+/// Returns the file-type tags for the path `file_path`.
+///
+/// The path is inspected with `fs::symlink_metadata`, which does not follow
+/// a final symlink, so a symlink is reported as [`Tags::Symlink`] even when
+/// its target is missing.
+///
+/// # Panics
+///
+/// Panics if `file_path` cannot be inspected, e.g. because it does not exist.
+pub fn tags_from_path(file_path: &str) -> HashSet<Tags> {
+    // TODO: Convert to Error
+    let metadata = match fs::symlink_metadata(file_path) {
+        Ok(metadata) => metadata,
+        Err(err) => panic!("{file_path} does not exist or cannot be inspected: {err}"),
+    };
+
+    let file_type = metadata.file_type();
+    if file_type.is_symlink() {
+        return HashSet::from([Tags::Symlink]);
+    }
+    if file_type.is_dir() {
+        return HashSet::from([Tags::Directory]);
+    }
+    if file_type.is_socket() {
+        return HashSet::from([Tags::Socket]);
+    }
+
+    // TODO: add `Tags::Executable` or `Tags::NonExecutable` based on the
+    // permission bits (`metadata.mode() & 0o777`).
+    // TODO: see if we can get tags from `tags_from_filename()` and if not,
+    // fall back to parsing the shebang. The two functions return different
+    // tag types (`Tags` vs. `String`), so the results cannot be merged yet.
+    HashSet::from([Tags::File])
+}
+
+/// Returns the tags implied by the name of `filename`; the file is not
+/// opened and does not have to exist.
+///
+/// Follows the lookup order of identify.py:
+///
+/// 1. The file name, then each of its dot-separated parts, is looked up in
+///    [`tags::NAMES`]; the first match wins. This lets "Dockerfile.xenial"
+///    match "Dockerfile".
+/// 2. The lowercased extension, if there is one, is looked up in
+///    [`tags::EXTENSIONS`] and, failing that, in
+///    [`tags::EXTENSIONS_NEED_BINARY_CHECK`].
+///
+/// A path with no file name (such as `..`) or no extension (such as
+/// `Makefile`) contributes nothing for that step.
+pub fn tags_from_filename(filename: &str) -> HashSet<String> {
+    let path = Path::new(filename);
+    let mut ret = HashSet::new();
+
+    if let Some(name) = path.file_name().and_then(OsStr::to_str) {
+        for part in std::iter::once(name).chain(name.split('.')) {
+            if let Some(name_tags) = tags::NAMES.get(part) {
+                ret.extend(name_tags.iter().map(|tag| tag.to_string()));
+                break;
+            }
+        }
+    }
+
+    if let Some(ext) = path.extension().and_then(OsStr::to_str) {
+        let ext = ext.to_lowercase();
+        if let Some(ext_tags) = tags::EXTENSIONS.get(ext.as_str()) {
+            ret.extend(ext_tags.iter().map(|tag| tag.to_string()));
+        } else if let Some(ext_tags) = tags::EXTENSIONS_NEED_BINARY_CHECK.get(ext.as_str()) {
+            ret.extend(ext_tags.iter().map(|tag| tag.to_string()));
+        }
+    }
+
+    ret
+}
+
+pub fn tags_from_interpreter(interpreter: &str) ->
HashSet<String> {
+    HashSet::new()
+}
+
+pub fn is_text(/* bytes io */) -> bool {
+    false
+}
+
+pub fn file_is_text(path: &str) -> bool {
+    false
+}
+
+/*
+pub fn parse_shebang( /* bytesio */) -> tuple of unknown size? {
+}
+
+
+pub fn parse_shebang_from_file(path: PathBuf) -> tuple of unknown size? {
+}
+*/
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 00000000..d13da550
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,28 @@
+use std::env;
+
+mod identify;
+mod tags;
+
+/// Prints the tags for one file given on the command line.
+///
+/// `identify FILE` inspects the path on disk; `identify --filename-only FILE`
+/// derives tags from the name alone. Anything else prints the usage to
+/// stderr and exits with a non-zero status.
+fn main() {
+    let args: Vec<String> = env::args().skip(1).collect();
+
+    // Match on the shape of the argument list so that `--filename-only`
+    // without a FILE is a usage error rather than an out-of-bounds index.
+    match args.as_slice() {
+        [flag, file, ..] if flag == "--filename-only" => {
+            println!("{:?}", identify::tags_from_filename(file));
+        }
+        [file, ..] if file != "--filename-only" => {
+            println!("{:?}", identify::tags_from_path(file));
+        }
+        _ => {
+            eprintln!("Usage: identify [--filename-only] FILE");
+            std::process::exit(2);
+        }
+    }
+}
diff --git a/src/tags.rs b/src/tags.rs
new file mode 100644
index 00000000..e1534e5a
--- /dev/null
+++ b/src/tags.rs
@@ -0,0 +1,23 @@
+use phf::phf_map;
+use phf::phf_set;
+
+// Each table below is a `phf_map!(...)` expression that build.rs writes to
+// `$OUT_DIR`; `include!` splices it in at compile time. The tables are
+// `static`s so that each exists once in the binary: a `const` is inlined
+// afresh at every place it is used.
+
+/// Table scraped from the Python `NAMES` dict.
+pub static NAMES: phf::Map<&str, phf::Set<&str>> =
+    include!(concat!(env!("OUT_DIR"), "/names.rs"));
+
+/// Table scraped from the Python `EXTENSIONS` dict.
+pub static EXTENSIONS: phf::Map<&str, phf::Set<&str>> =
+    include!(concat!(env!("OUT_DIR"), "/extensions.rs"));
+
+/// Table scraped from the Python `EXTENSIONS_NEED_BINARY_CHECK` dict.
+pub static EXTENSIONS_NEED_BINARY_CHECK: phf::Map<&str, phf::Set<&str>> =
+    include!(concat!(env!("OUT_DIR"), "/extensions_need_binary_check.rs"));
+
+/// Table scraped from the Python `INTERPRETERS` dict.
+pub static INTERPRETERS: phf::Map<&str, phf::Set<&str>> =
+    include!(concat!(env!("OUT_DIR"), "/interpreters.rs"));