// This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. /*! Functionality for a Python importer. This module defines a Python meta path importer and associated functionality for importing Python modules from memory. */ use std::cell::RefCell; use std::collections::{HashMap, HashSet}; use std::ffi::CStr; use std::io::Cursor; use std::sync::Arc; use byteorder::{LittleEndian, ReadBytesExt}; use cpython::exc::{FileNotFoundError, ImportError, RuntimeError, ValueError}; use cpython::{ py_class, py_class_impl, py_coerce_item, py_fn, NoArgs, ObjectProtocol, PyClone, PyDict, PyErr, PyList, PyModule, PyObject, PyResult, PyString, PyTuple, Python, PythonObject, ToPyObject, }; use python3_sys as pyffi; use python3_sys::{PyBUF_READ, PyMemoryView_FromMemory}; use super::pyinterp::PYOXIDIZER_IMPORTER_NAME; /// Obtain a Python memoryview referencing a memory slice. /// /// New memoryview allows Python to access the underlying memory without /// copying it. #[inline] fn get_memory_view(py: Python, data: &'static [u8]) -> Option { let ptr = unsafe { PyMemoryView_FromMemory(data.as_ptr() as _, data.len() as _, PyBUF_READ) }; unsafe { PyObject::from_owned_ptr_opt(py, ptr) } } /// Holds pointers to Python module data in memory. #[derive(Debug)] struct PythonModuleData { source: Option<&'static [u8]>, bytecode: Option<&'static [u8]>, } impl PythonModuleData { /// Obtain a PyMemoryView instance for source data. fn get_source_memory_view(&self, py: Python) -> Option { match self.source { Some(data) => get_memory_view(py, data), None => None, } } /// Obtain a PyMemoryView instance for bytecode data. fn get_bytecode_memory_view(&self, py: Python) -> Option { match self.bytecode { Some(data) => get_memory_view(py, data), None => None, } } } /// Represents Python modules data in memory. /// /// This is essentially an index over a raw backing blob. struct PythonModulesData { data: HashMap<&'static str, PythonModuleData>, } impl PythonModulesData { /// Construct a new instance from a memory slice. fn from(data: &'static [u8]) -> Result { let mut reader = Cursor::new(data); let count = reader .read_u32::() .or_else(|_| Err("failed reading count"))?; let mut index = Vec::with_capacity(count as usize); let mut total_names_length = 0; let mut total_sources_length = 0; for _ in 0..count { let name_length = reader .read_u32::() .or_else(|_| Err("failed reading name length"))? as usize; let source_length = reader .read_u32::() .or_else(|_| Err("failed reading source length"))? as usize; let bytecode_length = reader .read_u32::() .or_else(|_| Err("failed reading bytecode length"))? as usize; index.push((name_length, source_length, bytecode_length)); total_names_length += name_length; total_sources_length += source_length; } let mut res = HashMap::with_capacity(count as usize); let sources_start_offset = reader.position() as usize + total_names_length; let bytecodes_start_offset = sources_start_offset + total_sources_length; let mut sources_current_offset: usize = 0; let mut bytecodes_current_offset: usize = 0; for (name_length, source_length, bytecode_length) in index { let offset = reader.position() as usize; let name = unsafe { std::str::from_utf8_unchecked(&data[offset..offset + name_length]) }; let source_offset = sources_start_offset + sources_current_offset; let source = if source_length > 0 { Some(&data[source_offset..source_offset + source_length]) } else { None }; let bytecode_offset = bytecodes_start_offset + bytecodes_current_offset; let bytecode = if bytecode_length > 0 { Some(&data[bytecode_offset..bytecode_offset + bytecode_length]) } else { None }; reader.set_position(offset as u64 + name_length as u64); sources_current_offset += source_length; bytecodes_current_offset += bytecode_length; res.insert(name, PythonModuleData { source, bytecode }); } Ok(PythonModulesData { data: res }) } } /// Represents Python resources data in memory. /// /// This is essentially an index over a raw backing blob. struct PythonResourcesData { packages: HashMap<&'static str, Arc>>>, } impl PythonResourcesData { fn from(data: &'static [u8]) -> Result { let mut reader = Cursor::new(data); let package_count = reader .read_u32::() .or_else(|_| Err("failed reading package count"))? as usize; let mut index = Vec::with_capacity(package_count); let mut total_names_length = 0; for _ in 0..package_count { let package_name_length = reader .read_u32::() .or_else(|_| Err("failed reading package name length"))? as usize; let resource_count = reader .read_u32::() .or_else(|_| Err("failed reading resource count"))? as usize; total_names_length += package_name_length; let mut package_index = Vec::with_capacity(resource_count); for _ in 0..resource_count { let resource_name_length = reader .read_u32::() .or_else(|_| Err("failed reading resource name length"))? as usize; let resource_data_length = reader .read_u32::() .or_else(|_| Err("failed reading resource data length"))? as usize; total_names_length += resource_name_length; package_index.push((resource_name_length, resource_data_length)); } index.push((package_name_length, package_index)); } let mut name_offset = reader.position() as usize; let data_offset = name_offset + total_names_length; let mut res = HashMap::new(); for (package_name_length, package_index) in index { let package_name = unsafe { std::str::from_utf8_unchecked(&data[name_offset..name_offset + package_name_length]) }; name_offset += package_name_length; let mut package_data = Box::new(HashMap::new()); for (resource_name_length, resource_data_length) in package_index { let resource_name = unsafe { std::str::from_utf8_unchecked( &data[name_offset..name_offset + resource_name_length], ) }; name_offset += resource_name_length; let resource_data = &data[data_offset..data_offset + resource_data_length]; package_data.insert(resource_name, resource_data); } res.insert(package_name, Arc::new(package_data)); } Ok(PythonResourcesData { packages: res }) } } #[allow(unused_doc_comments)] /// Python type to import modules. /// /// This type implements the importlib.abc.MetaPathFinder interface for /// finding/loading modules. It supports loading various flavors of modules, /// allowing it to be the only registered sys.meta_path importer. py_class!(class PyOxidizerFinder |py| { data imp_module: PyModule; data marshal_loads: PyObject; data builtin_importer: PyObject; data frozen_importer: PyObject; data call_with_frames_removed: PyObject; data module_spec_type: PyObject; data decode_source: PyObject; data exec_fn: PyObject; data packages: HashSet<&'static str>; data known_modules: KnownModules; data resources: HashMap<&'static str, Arc>>>; data resource_readers: RefCell>>; // Start of importlib.abc.MetaPathFinder interface. def find_spec(&self, fullname: &PyString, path: &PyObject, target: Option = None) -> PyResult { let key = fullname.to_string(py)?; if let Some(flavor) = self.known_modules(py).get(&*key) { match flavor { KnownModuleFlavor::Builtin => { // BuiltinImporter.find_spec() always returns None if `path` is defined. // And it doesn't use `target`. So don't proxy these values. self.builtin_importer(py).call_method(py, "find_spec", (fullname,), None) } KnownModuleFlavor::Frozen => { self.frozen_importer(py).call_method(py, "find_spec", (fullname, path, target), None) } KnownModuleFlavor::InMemory { .. } => { let is_package = self.packages(py).contains(&*key); // TODO consider setting origin and has_location so __file__ will be // populated. let kwargs = PyDict::new(py); kwargs.set_item(py, "is_package", is_package)?; self.module_spec_type(py).call(py, (fullname, self), Some(&kwargs)) } } } else { Ok(py.None()) } } def find_module(&self, _fullname: &PyObject, _path: &PyObject) -> PyResult { // Method is deprecated. Always returns None. // We /could/ call find_spec(). Meh. Ok(py.None()) } def invalidate_caches(&self) -> PyResult { Ok(py.None()) } // End of importlib.abc.MetaPathFinder interface. // Start of importlib.abc.Loader interface. def create_module(&self, _spec: &PyObject) -> PyResult { Ok(py.None()) } def exec_module(&self, module: &PyObject) -> PyResult { let name = module.getattr(py, "__name__")?; let key = name.extract::(py)?; if let Some(flavor) = self.known_modules(py).get(&*key) { match flavor { KnownModuleFlavor::Builtin => { self.builtin_importer(py).call_method(py, "exec_module", (module,), None) }, KnownModuleFlavor::Frozen => { self.frozen_importer(py).call_method(py, "exec_module", (module,), None) }, KnownModuleFlavor::InMemory { module_data } => { match module_data.get_bytecode_memory_view(py) { Some(value) => { let code = self.marshal_loads(py).call(py, (value,), None)?; let exec_fn = self.exec_fn(py); let dict = module.getattr(py, "__dict__")?; self.call_with_frames_removed(py).call(py, (exec_fn, code, dict), None) }, None => { Err(PyErr::new::(py, ("cannot find code in memory", name))) } } }, } } else { // Raising here might make more sense, as exec_module() shouldn't // be called on the Loader that didn't create the module. Ok(py.None()) } } // End of importlib.abc.Loader interface. // Start of importlib.abc.InspectLoader interface. def get_code(&self, fullname: &PyString) -> PyResult { let key = fullname.to_string(py)?; if let Some(flavor) = self.known_modules(py).get(&*key) { match flavor { KnownModuleFlavor::Frozen => { let imp_module = self.imp_module(py); imp_module.call(py, "get_frozen_object", (fullname,), None) }, KnownModuleFlavor::InMemory { module_data } => { match module_data.get_bytecode_memory_view(py) { Some(value) => { self.marshal_loads(py).call(py, (value,), None) } None => { Err(PyErr::new::(py, ("cannot find code in memory", fullname))) } } }, KnownModuleFlavor::Builtin => { Ok(py.None()) } } } else { Ok(py.None()) } } def get_source(&self, fullname: &PyString) -> PyResult { let key = fullname.to_string(py)?; if let Some(flavor) = self.known_modules(py).get(&*key) { if let KnownModuleFlavor::InMemory { module_data } = flavor { match module_data.get_source_memory_view(py) { Some(value) => { self.decode_source(py).call(py, (value,), None) }, None => { Err(PyErr::new::(py, ("source not available", fullname))) } } } else { Ok(py.None()) } } else { Ok(py.None()) } } // End of importlib.abc.InspectLoader interface. // Support obtaining ResourceReader instances. def get_resource_loader(&self, fullname: &PyString) -> PyResult { let key = fullname.to_string(py)?; // This should not happen since code below should not be recursive into this // function. let mut resource_readers = match self.resource_readers(py).try_borrow_mut() { Ok(v) => v, Err(_) => { return Err(PyErr::new::(py, "resource reader already borrowed")); } }; // Return an existing instance if we have one. if let Some(reader) = resource_readers.get(&*key) { return Ok(reader.clone_ref(py)); } // Only create a reader if the name is a package. if self.packages(py).contains(&*key) { // Not all packages have known resources. let resources = match self.resources(py).get(&*key) { Some(v) => v.clone(), None => { let h: Box> = Box::new(HashMap::new()); Arc::new(h) } }; let reader = PyOxidizerResourceReader::create_instance(py, resources)?.into_object(); resource_readers.insert(key.to_string(), reader.clone_ref(py)); Ok(reader) } else { Ok(py.None()) } } }); #[allow(unused_doc_comments)] /// Implements in-memory reading of resource data. /// /// Implements importlib.abc.ResourceReader. py_class!(class PyOxidizerResourceReader |py| { data resources: Arc>>; /// Returns an opened, file-like object for binary reading of the resource. /// /// If the resource cannot be found, FileNotFoundError is raised. def open_resource(&self, resource: &PyString) -> PyResult { let key = resource.to_string(py)?; if let Some(data) = self.resources(py).get(&*key) { match get_memory_view(py, data) { Some(mv) => { let io_module = py.import("io")?; let bytes_io = io_module.get(py, "BytesIO")?; bytes_io.call(py, (mv,), None) } None => Err(PyErr::fetch(py)) } } else { Err(PyErr::new::(py, "resource not found")) } } /// Returns the file system path to the resource. /// /// If the resource does not concretely exist on the file system, raise /// FileNotFoundError. def resource_path(&self, _resource: &PyString) -> PyResult { Err(PyErr::new::(py, "in-memory resources do not have filesystem paths")) } /// Returns True if the named name is considered a resource. FileNotFoundError /// is raised if name does not exist. def is_resource(&self, name: &PyString) -> PyResult { let key = name.to_string(py)?; if self.resources(py).contains_key(&*key) { Ok(py.True().as_object().clone_ref(py)) } else { Err(PyErr::new::(py, "resource not found")) } } /// Returns an iterable of strings over the contents of the package. /// /// Do note that it is not required that all names returned by the iterator be actual resources, /// e.g. it is acceptable to return names for which is_resource() would be false. /// /// Allowing non-resource names to be returned is to allow for situations where how a package /// and its resources are stored are known a priori and the non-resource names would be useful. /// For instance, returning subdirectory names is allowed so that when it is known that the /// package and resources are stored on the file system then those subdirectory names can be /// used directly. def contents(&self) -> PyResult { let resources = self.resources(py); let mut names = Vec::with_capacity(resources.len()); for name in resources.keys() { names.push(name.to_py_object(py)); } let names_list = names.to_py_object(py); Ok(names_list.as_object().clone_ref(py)) } }); fn populate_packages(packages: &mut HashSet<&'static str>, name: &'static str) { let mut search = name; while let Some(idx) = search.rfind('.') { packages.insert(&search[0..idx]); search = &search[0..idx]; } } const DOC: &[u8] = b"Binary representation of Python modules\0"; /// Represents global module state to be passed at interpreter initialization time. #[derive(Debug)] pub struct InitModuleState { /// Whether to register the filesystem importer on sys.meta_path. pub register_filesystem_importer: bool, /// Values to set on sys.path. pub sys_paths: Vec, /// Raw data constituting Python module source code. pub py_modules_data: &'static [u8], /// Raw data constituting Python resources data. pub py_resources_data: &'static [u8], } /// Holds reference to next module state struct. /// /// This module state will be copied into the module's state when the /// Python module is initialized. pub static mut NEXT_MODULE_STATE: *const InitModuleState = std::ptr::null(); /// Represents which importer to use for known modules. #[derive(Debug)] enum KnownModuleFlavor { Builtin, Frozen, InMemory { module_data: PythonModuleData }, } type KnownModules = HashMap<&'static str, KnownModuleFlavor>; /// State associated with each importer module instance. /// /// We write per-module state to per-module instances of this struct so /// we don't rely on global variables and so multiple importer modules can /// exist without issue. #[derive(Debug)] struct ModuleState { /// Whether to register PathFinder on sys.meta_path. register_filesystem_importer: bool, /// Values to set on sys.path. sys_paths: Vec, /// Raw data constituting Python module source code. py_modules_data: &'static [u8], /// Raw data constituting Python resources data. py_resources_data: &'static [u8], /// Whether setup() has been called. setup_called: bool, } /// Obtain the module state for an instance of our importer module. /// /// Creates a Python exception on failure. /// /// Doesn't do type checking that the PyModule is of the appropriate type. fn get_module_state<'a>(py: Python, m: &'a PyModule) -> Result<&'a mut ModuleState, PyErr> { let ptr = m.as_object().as_ptr(); let state = unsafe { pyffi::PyModule_GetState(ptr) as *mut ModuleState }; if state.is_null() { let err = PyErr::new::(py, "unable to retrieve module state"); return Err(err); } Ok(unsafe { &mut *state }) } /// Initialize the Python module object. /// /// This is called as part of the PyInit_* function to create the internal /// module object for the interpreter. /// /// This receives a handle to the current Python interpreter and just-created /// Python module instance. It populates the internal module state and registers /// a _setup() on the module object for usage by Python. /// /// Because this function accesses NEXT_MODULE_STATE, it should only be /// called during interpreter initialization. fn module_init(py: Python, m: &PyModule) -> PyResult<()> { let mut state = get_module_state(py, m)?; unsafe { state.register_filesystem_importer = (*NEXT_MODULE_STATE).register_filesystem_importer; // TODO we could move the value if we wanted to avoid the clone(). state.sys_paths = (*NEXT_MODULE_STATE).sys_paths.clone(); state.py_modules_data = (*NEXT_MODULE_STATE).py_modules_data; state.py_resources_data = (*NEXT_MODULE_STATE).py_resources_data; } state.setup_called = false; m.add( py, "_setup", py_fn!( py, module_setup( m: PyModule, bootstrap_module: PyModule, marshal_module: PyModule, decode_source: PyObject ) ), )?; Ok(()) } /// Called after module import/initialization to configure the importing mechanism. /// /// This does the heavy work of configuring the importing mechanism. /// /// This function should only be called once as part of /// _frozen_importlib_external._install_external_importers(). fn module_setup( py: Python, m: PyModule, bootstrap_module: PyModule, marshal_module: PyModule, decode_source: PyObject, ) -> PyResult { let state = get_module_state(py, &m)?; if state.setup_called { return Err(PyErr::new::( py, "PyOxidizer _setup() already called", )); } state.setup_called = true; let imp_module = bootstrap_module.get(py, "_imp")?; let imp_module = imp_module.cast_into::(py)?; let sys_module = bootstrap_module.get(py, "sys")?; let sys_module = sys_module.cast_as::(py)?; let meta_path_object = sys_module.get(py, "meta_path")?; // We should be executing as part of // _frozen_importlib_external._install_external_importers(). // _frozen_importlib._install() should have already been called and set up // sys.meta_path with [BuiltinImporter, FrozenImporter]. Those should be the // only meta path importers present. let meta_path = meta_path_object.cast_as::(py)?; if meta_path.len(py) != 2 { return Err(PyErr::new::( py, "sys.meta_path does not contain 2 values", )); } let builtin_importer = meta_path.get_item(py, 0); let frozen_importer = meta_path.get_item(py, 1); // It may seem inefficient to create a full HashMap of the parsed data instead of e.g. // streaming it. But the overhead of iterators was measured to be more than building // up a temporary HashMap. let modules_data = match PythonModulesData::from(state.py_modules_data) { Ok(v) => v, Err(msg) => return Err(PyErr::new::(py, msg)), }; // Populate our known module lookup table with entries from builtins, frozens, and // finally us. Last write wins and has the same effect as registering our // meta path importer first. This should be safe. If nothing else, it allows // some builtins to be overwritten by .py implemented modules. let mut known_modules = KnownModules::with_capacity(modules_data.data.len() + 10); for i in 0.. { let record = unsafe { pyffi::PyImport_Inittab.offset(i) }; if unsafe { *record }.name.is_null() { break; } let name = unsafe { CStr::from_ptr((*record).name as _) }; let name_str = match name.to_str() { Ok(v) => v, Err(_) => { return Err(PyErr::new::( py, "unable to parse PyImport_Inittab", )); } }; known_modules.insert(name_str, KnownModuleFlavor::Builtin); } for i in 0.. { let record = unsafe { pyffi::PyImport_FrozenModules.offset(i) }; if unsafe { *record }.name.is_null() { break; } let name = unsafe { CStr::from_ptr((*record).name as _) }; let name_str = match name.to_str() { Ok(v) => v, Err(_) => { return Err(PyErr::new::( py, "unable to parse PyImport_FrozenModules", )); } }; known_modules.insert(name_str, KnownModuleFlavor::Frozen); } // TODO consider baking set of packages into embedded data. let mut packages: HashSet<&'static str> = HashSet::with_capacity(modules_data.data.len()); for (name, record) in modules_data.data { known_modules.insert( name, KnownModuleFlavor::InMemory { module_data: record, }, ); populate_packages(&mut packages, name); } let resources_data = match PythonResourcesData::from(state.py_resources_data) { Ok(v) => v, Err(msg) => return Err(PyErr::new::(py, msg)), }; let marshal_loads = marshal_module.get(py, "loads")?; let call_with_frames_removed = bootstrap_module.get(py, "_call_with_frames_removed")?; let module_spec_type = bootstrap_module.get(py, "ModuleSpec")?; let builtins_module = match unsafe { PyObject::from_borrowed_ptr_opt(py, pyffi::PyEval_GetBuiltins()) } { Some(o) => o.cast_into::(py), None => { return Err(PyErr::new::( py, "unable to obtain __builtins__", )); } }?; let exec_fn = match builtins_module.get_item(py, "exec") { Some(v) => v, None => { return Err(PyErr::new::( py, "could not obtain __builtins__.exec", )); } }; let resource_readers: RefCell>> = RefCell::new(Box::new(HashMap::new())); let unified_importer = PyOxidizerFinder::create_instance( py, imp_module, marshal_loads, builtin_importer, frozen_importer, call_with_frames_removed, module_spec_type, decode_source, exec_fn, packages, known_modules, resources_data.packages, resource_readers, )?; meta_path_object.call_method(py, "clear", NoArgs, None)?; meta_path_object.call_method(py, "append", (unified_importer,), None)?; // At this point the importing mechanism is fully initialized to use our // unified importer, which handles built-in, frozen, and in-memory imports. // Because we're probably running during Py_Initialize() and stdlib modules // may not be in-memory, we need to register and configure additional importers // here, before continuing with Py_Initialize(), otherwise we may not find // the standard library! if state.register_filesystem_importer { // This is what importlib._bootstrap_external usually does: // supported_loaders = _get_supported_file_loaders() // sys.path_hooks.extend([FileFinder.path_hook(*supported_loaders)]) // sys.meta_path.append(PathFinder) let frozen_importlib_external = py.import("_frozen_importlib_external")?; let loaders = frozen_importlib_external.call(py, "_get_supported_file_loaders", NoArgs, None)?; let loaders_list = loaders.cast_as::(py)?; let loaders_vec: Vec = loaders_list.iter(py).collect(); let loaders_tuple = PyTuple::new(py, loaders_vec.as_slice()); let file_finder = frozen_importlib_external.get(py, "FileFinder")?; let path_hook = file_finder.call_method(py, "path_hook", loaders_tuple, None)?; let path_hooks = sys_module.get(py, "path_hooks")?; path_hooks.call_method(py, "append", (path_hook,), None)?; let path_finder = frozen_importlib_external.get(py, "PathFinder")?; let meta_path = sys_module.get(py, "meta_path")?; meta_path.call_method(py, "append", (path_finder,), None)?; } // Ideally we should be calling Py_SetPath() before Py_Initialize() to set sys.path. // But we tried to do this and only ran into problems due to string conversions, // unwanted side-effects. Updating sys.path directly before it is used by PathFinder // (which was just registered above) should have the same effect. // Always clear out sys.path. let sys_path = sys_module.get(py, "path")?; sys_path.call_method(py, "clear", NoArgs, None)?; // And repopulate it with entries from the config. for path in &state.sys_paths { let py_path = PyString::new(py, path.as_str()); sys_path.call_method(py, "append", (py_path,), None)?; } Ok(py.None()) } static mut MODULE_DEF: pyffi::PyModuleDef = pyffi::PyModuleDef { m_base: pyffi::PyModuleDef_HEAD_INIT, m_name: std::ptr::null(), m_doc: std::ptr::null(), m_size: std::mem::size_of::() as isize, m_methods: 0 as *mut _, m_slots: 0 as *mut _, m_traverse: None, m_clear: None, m_free: None, }; /// Module initialization function. /// /// This creates the Python module object. /// /// We don't use the macros in the cpython crate because they are somewhat /// opinionated about how things should work. e.g. they call /// PyEval_InitThreads(), which is undesired. We want total control. #[allow(non_snake_case)] pub extern "C" fn PyInit__pyoxidizer_importer() -> *mut pyffi::PyObject { let py = unsafe { cpython::Python::assume_gil_acquired() }; // TRACKING RUST1.32 We can't call as_ptr() in const fn in Rust 1.31. unsafe { if MODULE_DEF.m_name.is_null() { MODULE_DEF.m_name = PYOXIDIZER_IMPORTER_NAME.as_ptr() as *const _; MODULE_DEF.m_doc = DOC.as_ptr() as *const _; } } let module = unsafe { pyffi::PyModule_Create(&mut MODULE_DEF) }; if module.is_null() { return module; } let module = match unsafe { PyObject::from_owned_ptr(py, module).cast_into::(py) } { Ok(m) => m, Err(e) => { PyErr::from(e).restore(py); return std::ptr::null_mut(); } }; match module_init(py, &module) { Ok(()) => module.into_object().steal_ptr(), Err(e) => { e.restore(py); std::ptr::null_mut() } } }