You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
651 lines
20 KiB
Rust
651 lines
20 KiB
Rust
#![cfg(feature = "host")]
|
|
|
|
use crate::cpu_list::CpuList;
|
|
use crate::{
|
|
GlobalConfig, InstanceConfig, QemuCommandBuilder, VfioConfig, VirtualMachineInfo,
|
|
VirtualMachineState,
|
|
};
|
|
use anyhow::{Context, Error};
|
|
use beau_collector::BeauCollector;
|
|
use libc::{cpu_set_t, sched_setaffinity, CPU_SET};
|
|
use qapi::qmp::{Event, QMP};
|
|
use qapi::Qmp;
|
|
use qapi_qmp::QmpCommand;
|
|
use std::fmt::{Debug, Formatter};
|
|
use std::fs::{read_dir, read_link, OpenOptions};
|
|
use std::io;
|
|
use std::io::{BufReader, ErrorKind, Read, Write};
|
|
use std::option::Option::Some;
|
|
use std::os::unix::net::UnixStream;
|
|
use std::os::unix::prelude::AsRawFd;
|
|
use std::path::{Path, PathBuf};
|
|
use std::process::{Child, Command};
|
|
use std::result::Result::Ok;
|
|
use std::slice::Iter;
|
|
use std::str::FromStr;
|
|
use std::sync::{Arc, Mutex, MutexGuard};
|
|
use std::time::{Duration, Instant};
|
|
use std::{fmt, mem};
|
|
|
|
#[derive(Debug)]
|
|
pub struct VirtualMachine {
|
|
working_dir: PathBuf,
|
|
state: VirtualMachineState,
|
|
config: InstanceConfig,
|
|
global_config: GlobalConfig,
|
|
process: Option<Child>,
|
|
control_socket: Option<ControlSocket>,
|
|
quit_after_shutdown: bool,
|
|
}
|
|
|
|
struct ControlSocket {
|
|
unix_stream: CloneableUnixStream,
|
|
qmp: Qmp<qapi::Stream<BufReader<CloneableUnixStream>, CloneableUnixStream>>,
|
|
_info: QMP,
|
|
}
|
|
|
|
impl Debug for ControlSocket {
|
|
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
|
f.debug_tuple("ControlSocket")
|
|
.field(&self.unix_stream)
|
|
.finish()
|
|
}
|
|
}
|
|
|
|
const AUTO_UNBIND_BLACKLIST: &[&str] = &["nvidia", "amdgpu"];
|
|
|
|
impl VirtualMachine {
|
|
pub fn new<P: AsRef<Path>>(
|
|
config: InstanceConfig,
|
|
global_config: &GlobalConfig,
|
|
working_dir: P,
|
|
) -> VirtualMachine {
|
|
VirtualMachine {
|
|
working_dir: working_dir.as_ref().to_path_buf(),
|
|
state: VirtualMachineState::Loaded,
|
|
config,
|
|
global_config: global_config.clone(),
|
|
process: None,
|
|
control_socket: None,
|
|
quit_after_shutdown: true,
|
|
}
|
|
}
|
|
|
|
pub fn vfio_devices(&self) -> Iter<'_, VfioConfig> {
|
|
self.config.vfio.iter()
|
|
}
|
|
|
|
pub fn name(&self) -> &str {
|
|
&self.config.name
|
|
}
|
|
|
|
pub fn info(&self) -> VirtualMachineInfo {
|
|
VirtualMachineInfo {
|
|
name: self.name().to_string(),
|
|
working_dir: self.working_dir.clone(),
|
|
config: self.config.clone(),
|
|
state: self.state,
|
|
quit_after_shutdown: self.quit_after_shutdown,
|
|
}
|
|
}
|
|
|
|
pub fn prepare(&mut self, execute_fixes: bool, force: bool) -> Result<(), anyhow::Error> {
|
|
let mut results = vec![];
|
|
results.extend(self.prepare_disks());
|
|
results.extend(self.prepare_vfio(execute_fixes, force));
|
|
results.extend(self.prepare_shm());
|
|
results.extend(self.prepare_sockets());
|
|
results
|
|
.into_iter()
|
|
.bcollect::<()>()
|
|
.with_context(|| format!("Failed to prepare VM {}", self.config.name))?;
|
|
|
|
if self.state == VirtualMachineState::Loaded {
|
|
self.state = VirtualMachineState::Prepared;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
pub fn prepare_shm(&mut self) -> Vec<Result<(), anyhow::Error>> {
|
|
let mut shm = vec![];
|
|
if self.config.looking_glass.enabled {
|
|
if self.config.looking_glass.mem_path.is_empty() {
|
|
self.config.looking_glass.mem_path =
|
|
format!("/dev/shm/vore/{}/looking-glass", self.config.name);
|
|
}
|
|
|
|
shm.push(&self.config.looking_glass.mem_path);
|
|
}
|
|
|
|
if self.config.scream.enabled {
|
|
if self.config.scream.mem_path.is_empty() {
|
|
self.config.scream.mem_path = format!("/dev/shm/vore/{}/scream", self.config.name);
|
|
}
|
|
|
|
shm.push(&self.config.scream.mem_path);
|
|
}
|
|
|
|
shm.into_iter()
|
|
.map(|x| Path::new(x))
|
|
.filter_map(|x| x.parent())
|
|
.filter(|x| !x.is_dir())
|
|
.map(|x| {
|
|
std::fs::create_dir_all(&x).with_context(|| {
|
|
format!("Failed creating directories for shared memory ({:?})", x)
|
|
})
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
pub fn prepare_sockets(&mut self) -> Vec<Result<(), anyhow::Error>> {
|
|
let mut sockets = vec![];
|
|
if self.config.spice.enabled {
|
|
if self.config.spice.socket_path.is_empty() {
|
|
self.config.spice.socket_path = self
|
|
.working_dir
|
|
.join("spice.sock")
|
|
.to_str()
|
|
.unwrap()
|
|
.to_string();
|
|
}
|
|
|
|
sockets.push(&self.config.spice.socket_path);
|
|
}
|
|
|
|
sockets
|
|
.into_iter()
|
|
.map(|x| Path::new(x))
|
|
.filter_map(|x| x.parent())
|
|
.filter(|x| !x.is_dir())
|
|
.map(|x| {
|
|
std::fs::create_dir_all(&x).with_context(|| {
|
|
format!("Failed creating directories for shared memory ({:?})", x)
|
|
})
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
///
|
|
/// Doesn't really prepare them, but mostly checks if the user has permissions to read them
|
|
///
|
|
pub fn prepare_disks(&self) -> Vec<Result<(), anyhow::Error>> {
|
|
self.config
|
|
.disks
|
|
.iter()
|
|
.map(|disk| {
|
|
OpenOptions::new()
|
|
.read(true)
|
|
.open(&disk.path)
|
|
.with_context(|| format!("Failed to open disk {}", disk.path))?;
|
|
|
|
Ok(())
|
|
})
|
|
.collect::<Vec<_>>()
|
|
}
|
|
|
|
/// Prepare VFIO related shenanigans,
|
|
/// This includes if requested via [execute_fixes] unbinding the requested vfio pci devices
|
|
/// And binding them to vfio-pci
|
|
///
|
|
/// With [execute_fixes] set to false, it will only check if everything is sane, and the correct driver is loaded
|
|
///
|
|
/// [force] can be given to auto-bind PCI devices that are blacklisted anyway. this can result in vore indefinitely hanging.
|
|
fn prepare_vfio(&mut self, execute_fixes: bool, force: bool) -> Vec<Result<(), Error>> {
|
|
if self.config.vfio.is_empty() {
|
|
return vec![];
|
|
}
|
|
|
|
match Command::new("modprobe")
|
|
.arg("vfio-pci")
|
|
.spawn()
|
|
.and_then(|mut x| x.wait())
|
|
{
|
|
Err(err) => return vec![Err(err.into())],
|
|
Ok(x) if !x.success() => {
|
|
return vec![Err(anyhow::anyhow!(
|
|
"Failed to load vfio-pci kernel module. can't use VFIO"
|
|
))];
|
|
}
|
|
Ok(_) => {}
|
|
}
|
|
|
|
self.config
|
|
.vfio
|
|
.iter()
|
|
.map(|vfio| VirtualMachine::prepare_vfio_device(execute_fixes, force, vfio))
|
|
.collect::<Vec<_>>()
|
|
}
|
|
|
|
pub fn should_auto_start(&self) -> bool {
|
|
self.config.auto_start
|
|
}
|
|
|
|
pub fn prepare_vfio_device(
|
|
execute_fixes: bool,
|
|
force: bool,
|
|
vfio: &VfioConfig,
|
|
) -> Result<(), Error> {
|
|
let pci_driver_path = format!("/sys/bus/pci/devices/{:#}/driver", vfio.address);
|
|
|
|
let driver = match read_link(&pci_driver_path) {
|
|
Ok(driver_link) => {
|
|
let driver_path = driver_link.to_str().ok_or_else(|| {
|
|
anyhow::anyhow!(
|
|
"Path to device driver for PCI device at {} is not valid utf-8",
|
|
vfio.address
|
|
)
|
|
})?;
|
|
let driver = driver_path.split('/').last().ok_or_else(|| {
|
|
anyhow::anyhow!(
|
|
"Path to device driver for PCI device at {} doesn't have a path to a driver",
|
|
vfio.address
|
|
)
|
|
})?;
|
|
|
|
driver.to_string()
|
|
}
|
|
|
|
Err(err) if err.kind() == ErrorKind::NotFound => "".to_string(),
|
|
|
|
Err(err) => return Err(err.into()),
|
|
};
|
|
|
|
let is_blacklisted = AUTO_UNBIND_BLACKLIST.contains(&driver.as_str()) && !force;
|
|
|
|
if driver != "vfio-pci" && (!execute_fixes || is_blacklisted) {
|
|
if !driver.is_empty() && is_blacklisted {
|
|
anyhow::bail!("PCI device {} it's current driver is {}, but to be used with VFIO needs to be set to vfio-pci, this driver ({1}) has been blacklisted from automatic rebinding because it can't be cleanly unbound, please make sure this device is unbound before running vore", vfio.address, driver)
|
|
} else if !driver.is_empty() {
|
|
anyhow::bail!("PCI device {} it's current driver is {}, but to be used with VFIO needs to be set to vfio-pci", vfio.address, driver)
|
|
} else {
|
|
anyhow::bail!("PCI device at {} currently has no driver, but to be used with VFIO needs to be set to vfio-pci", vfio.address)
|
|
}
|
|
}
|
|
|
|
if driver != "vfio-pci" && execute_fixes && !is_blacklisted {
|
|
let address = format!("{:#}\n", vfio.address).into_bytes();
|
|
|
|
if !driver.is_empty() {
|
|
// Unbind the PCI device from the current driver
|
|
let mut unbind = std::fs::OpenOptions::new().append(true).open(format!(
|
|
"/sys/bus/pci/devices/{:#}/driver/unbind",
|
|
vfio.address
|
|
))?;
|
|
|
|
unbind.write_all(&address)?;
|
|
}
|
|
|
|
{
|
|
// Set a driver override
|
|
let mut driver_override = OpenOptions::new().append(true).open(format!(
|
|
"/sys/bus/pci/devices/{:#}/driver_override",
|
|
vfio.address
|
|
))?;
|
|
|
|
driver_override.write_all(b"vfio-pci\n")?;
|
|
}
|
|
|
|
{
|
|
// Probe the PCI device so the driver override is picked up
|
|
let mut probe = OpenOptions::new()
|
|
.append(true)
|
|
.open("/sys/bus/pci/drivers_probe")?;
|
|
probe.write_all(&address)?;
|
|
}
|
|
|
|
let new_link = read_link(&pci_driver_path)?;
|
|
if !new_link.ends_with("vfio-pci") {
|
|
anyhow::bail!("Tried to bind {} to vfio-pci but failed to do so (see /sys/bus/pci/devices/{:#} for more info)", vfio.address, vfio.address)
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn get_cmd_line(&self) -> Result<Vec<String>, anyhow::Error> {
|
|
let builder = QemuCommandBuilder::new(&self.global_config, self.working_dir.clone())?;
|
|
builder.build(&self.config)
|
|
}
|
|
|
|
pub fn pin_qemu_threads(&self) -> Result<(), anyhow::Error> {
|
|
let pid = if let Some(child) = &self.process {
|
|
child.id()
|
|
} else {
|
|
return Ok(());
|
|
};
|
|
|
|
let list = CpuList::adjacent(self.config.cpu.amount as usize);
|
|
if list.is_none() {
|
|
// If we are over provisioning CPU's there's not much use to pinning
|
|
return Ok(());
|
|
}
|
|
|
|
let list = list.unwrap();
|
|
|
|
let mut kvm_threads = vec![];
|
|
for item in read_dir(format!("/proc/{}/task", pid))? {
|
|
let entry = item?;
|
|
if !entry.file_type()?.is_dir() {
|
|
continue;
|
|
}
|
|
|
|
let res = entry
|
|
.file_name()
|
|
.to_str()
|
|
.ok_or_else(|| anyhow::anyhow!(""))
|
|
.and_then(|x| usize::from_str(x).map_err(From::from));
|
|
if res.is_err() {
|
|
continue;
|
|
}
|
|
|
|
let tid = res.unwrap();
|
|
let name = entry.path().join("comm");
|
|
let comm = std::fs::read_to_string(name)?;
|
|
if comm.starts_with("CPU ") {
|
|
let nr = comm
|
|
.chars()
|
|
.skip(4)
|
|
.take_while(|x| x.is_ascii_digit())
|
|
.collect::<String>();
|
|
let cpu_id = usize::from_str(&nr).unwrap();
|
|
kvm_threads.push((tid, cpu_id));
|
|
}
|
|
}
|
|
|
|
for (tid, cpu_id) in kvm_threads {
|
|
if cpu_id >= list.len() {
|
|
// ???
|
|
continue;
|
|
}
|
|
|
|
let cpu = &list[cpu_id];
|
|
unsafe {
|
|
let mut set = mem::zeroed::<cpu_set_t>();
|
|
CPU_SET(cpu.id, &mut set);
|
|
sched_setaffinity(tid as i32, mem::size_of::<cpu_set_t>(), &set);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn boop(&mut self) -> Result<(), anyhow::Error> {
|
|
if let Some(qmp) = self.control_socket.as_mut() {
|
|
qmp.qmp.nop()?;
|
|
}
|
|
|
|
self.process_qmp_events()?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn process_qmp_events(&mut self) -> anyhow::Result<()> {
|
|
let events = if let Some(qmp) = self.control_socket.as_mut() {
|
|
// While we could iter, we keep hold of the mutable reference, so it's easier to just collect the events
|
|
qmp.qmp.events().collect::<Vec<_>>()
|
|
} else {
|
|
return Ok(());
|
|
};
|
|
|
|
for event in events {
|
|
log::info!("vm {} got event: {:?}", self.name(), event);
|
|
|
|
match event {
|
|
Event::STOP { .. } => {
|
|
if self.state == VirtualMachineState::Running {
|
|
self.state = VirtualMachineState::Paused;
|
|
}
|
|
}
|
|
Event::RESUME { .. } => {
|
|
self.state = VirtualMachineState::Running;
|
|
}
|
|
Event::SHUTDOWN { .. } => {
|
|
self.state = VirtualMachineState::Stopped;
|
|
|
|
if self.quit_after_shutdown {
|
|
self.quit()?;
|
|
}
|
|
}
|
|
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn pause(&mut self) -> Result<(), anyhow::Error> {
|
|
if self.state != VirtualMachineState::Running {
|
|
return Ok(());
|
|
}
|
|
|
|
self.send_qmp_command(&qapi_qmp::stop {})?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn send_qmp_command<C: QmpCommand>(&mut self, command: &C) -> Result<C::Ok, anyhow::Error> {
|
|
let res = if let Some(qmp) = self.control_socket.as_mut() {
|
|
qmp.qmp.execute(command)?
|
|
} else {
|
|
anyhow::bail!("No control socket available")
|
|
};
|
|
|
|
self.process_qmp_events()?;
|
|
Ok(res)
|
|
}
|
|
|
|
pub fn stop(&mut self) -> Result<(), anyhow::Error> {
|
|
if self.process.is_none()
|
|
|| self.control_socket.is_none()
|
|
|| self.state == VirtualMachineState::Stopped
|
|
{
|
|
return Ok(());
|
|
}
|
|
|
|
self.send_qmp_command(&qapi_qmp::system_powerdown {})?;
|
|
Ok(())
|
|
}
|
|
|
|
pub fn wait_till_stopped(&mut self) -> Result<(), anyhow::Error> {
|
|
self.wait(None, VirtualMachineState::Stopped)?;
|
|
Ok(())
|
|
}
|
|
|
|
pub fn quit(&mut self) -> Result<(), anyhow::Error> {
|
|
if self.control_socket.is_none() {
|
|
return Ok(());
|
|
}
|
|
|
|
match self.send_qmp_command(&qapi_qmp::quit {}) {
|
|
Err(err)
|
|
if err.downcast_ref::<io::Error>().map_or(false, |x| {
|
|
x.kind() == io::ErrorKind::UnexpectedEof
|
|
|| x.kind() == io::ErrorKind::ConnectionReset
|
|
}) => {}
|
|
err => {
|
|
err?;
|
|
}
|
|
}
|
|
|
|
if let Some(mut proc) = self.process.take() {
|
|
let _ = proc.wait();
|
|
}
|
|
|
|
self.control_socket = None;
|
|
self.state = VirtualMachineState::Prepared;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn wait(
|
|
&mut self,
|
|
duration: Option<Duration>,
|
|
target_state: VirtualMachineState,
|
|
) -> Result<bool, anyhow::Error> {
|
|
let start = Instant::now();
|
|
while duration.map_or(true, |dur| (Instant::now() - start) < dur) {
|
|
let has_socket = self
|
|
.control_socket
|
|
.as_mut()
|
|
.map(|x| x.qmp.nop())
|
|
.transpose()?
|
|
.is_some();
|
|
|
|
if !has_socket {
|
|
return Ok(self.state == target_state);
|
|
}
|
|
|
|
self.process_qmp_events()?;
|
|
|
|
if self.state == target_state {
|
|
return Ok(true);
|
|
}
|
|
|
|
if duration.is_some() {
|
|
std::thread::sleep(Duration::from_millis(500));
|
|
} else {
|
|
std::thread::sleep(Duration::from_secs(5));
|
|
}
|
|
}
|
|
|
|
Ok(self.state == target_state)
|
|
}
|
|
|
|
pub fn start(&mut self) -> Result<(), anyhow::Error> {
|
|
if let Some(proc) = &mut self.process {
|
|
if proc.try_wait()?.is_none() {
|
|
return Ok(());
|
|
}
|
|
}
|
|
|
|
if self.state == VirtualMachineState::Loaded {
|
|
self.prepare(true, false)?
|
|
}
|
|
|
|
let mut command = Command::new("qemu-system-x86_64");
|
|
command.args(
|
|
self.get_cmd_line()
|
|
.context("Failed to generate qemu command line")?,
|
|
);
|
|
self.process = Some(command.spawn()?);
|
|
|
|
let mut res = || {
|
|
let qemu_control_socket = format!("{}/qemu.sock", self.working_dir.to_str().unwrap());
|
|
let mut unix_stream = UnixStream::connect(&qemu_control_socket);
|
|
let mut time = 30;
|
|
while let Err(err) = unix_stream {
|
|
if time < 0 {
|
|
Err(err).context(format!(
|
|
"After 30 seconds, QEMU Control socket ({}) didn't come up",
|
|
qemu_control_socket
|
|
))?;
|
|
}
|
|
|
|
std::thread::sleep(Duration::from_secs(1));
|
|
unix_stream = UnixStream::connect(&qemu_control_socket);
|
|
|
|
if let Some(proc) = self.process.as_mut() {
|
|
if proc.try_wait()?.is_some() {
|
|
anyhow::bail!("QEMU quit early")
|
|
}
|
|
}
|
|
|
|
time -= 1;
|
|
}
|
|
|
|
let unix_stream = CloneableUnixStream::new(unix_stream.unwrap());
|
|
let mut qmp = Qmp::from_stream(unix_stream.clone());
|
|
|
|
let handshake = qmp.handshake()?;
|
|
|
|
let mut control_socket = ControlSocket {
|
|
unix_stream,
|
|
qmp,
|
|
_info: handshake,
|
|
};
|
|
|
|
self.pin_qemu_threads()?;
|
|
|
|
if self.config.looking_glass.enabled {
|
|
self.global_config
|
|
.vore
|
|
.chown(&self.config.looking_glass.mem_path)?;
|
|
}
|
|
|
|
if self.config.spice.enabled {
|
|
self.global_config
|
|
.vore
|
|
.chown(&self.config.spice.socket_path)?;
|
|
}
|
|
|
|
control_socket
|
|
.qmp
|
|
.execute(&qapi_qmp::cont {})
|
|
.context("Failed to send start command on qemu control socket")?;
|
|
|
|
control_socket.qmp.nop()?;
|
|
self.control_socket = Some(control_socket);
|
|
|
|
self.process_qmp_events()?;
|
|
|
|
Ok(())
|
|
};
|
|
|
|
let result_ = res();
|
|
if result_.is_err() {
|
|
if let Some(mut qemu) = self.process.take() {
|
|
let _ = qemu.kill();
|
|
qemu.wait()?;
|
|
}
|
|
}
|
|
|
|
result_
|
|
}
|
|
|
|
pub fn control_stream(&self) -> Option<&CloneableUnixStream> {
|
|
self.control_socket.as_ref().map(|x| &x.unix_stream)
|
|
}
|
|
}
|
|
|
|
#[derive(Clone, Debug)]
|
|
pub struct CloneableUnixStream(Arc<Mutex<UnixStream>>);
|
|
|
|
impl CloneableUnixStream {
|
|
pub fn new(unix_stream: UnixStream) -> CloneableUnixStream {
|
|
CloneableUnixStream(Arc::new(Mutex::new(unix_stream)))
|
|
}
|
|
|
|
pub fn lock(&self) -> Result<MutexGuard<'_, UnixStream>, std::io::Error> {
|
|
self.0.lock().map_err(|_| {
|
|
io::Error::new(
|
|
ErrorKind::Other,
|
|
anyhow::anyhow!("Failed to lock UnixStream"),
|
|
)
|
|
})
|
|
}
|
|
}
|
|
|
|
impl AsRawFd for CloneableUnixStream {
|
|
fn as_raw_fd(&self) -> i32 {
|
|
self.lock().unwrap().as_raw_fd()
|
|
}
|
|
}
|
|
|
|
impl Read for CloneableUnixStream {
|
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
|
let res = self.lock()?.read(buf);
|
|
res
|
|
}
|
|
}
|
|
|
|
impl Write for CloneableUnixStream {
|
|
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
|
self.lock()?.write(buf)
|
|
}
|
|
|
|
fn flush(&mut self) -> io::Result<()> {
|
|
self.lock()?.flush()
|
|
}
|
|
}
|