imago/
file.rs

1//! Use a plain file or host block device as storage.
2
3#[cfg(unix)]
4use crate::io_buffers::IoBuffer;
5use crate::io_buffers::{IoVector, IoVectorMut};
6#[cfg(unix)]
7use crate::misc_helpers::while_eintr;
8use crate::misc_helpers::ResultErrorContext;
9use crate::storage::drivers::CommonStorageHelper;
10use crate::storage::ext::write_full_zeroes;
11use crate::storage::PreallocateMode;
12use crate::{Storage, StorageCreateOptions, StorageOpenOptions};
13use cfg_if::cfg_if;
14use std::fmt::{self, Display, Formatter};
15use std::io::{self, Write};
16#[cfg(any(target_os = "linux", target_os = "macos"))]
17use std::os::fd::AsRawFd;
18#[cfg(unix)]
19use std::os::unix::fs::FileTypeExt;
20#[cfg(all(unix, not(target_os = "macos")))]
21use std::os::unix::fs::OpenOptionsExt;
22#[cfg(windows)]
23use std::os::windows::fs::{FileExt, OpenOptionsExt};
24#[cfg(windows)]
25use std::os::windows::io::AsRawHandle;
26use std::path::{Path, PathBuf};
27use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
28use std::sync::RwLock;
29use std::{cmp, fs};
30#[cfg(unix)]
31use tracing::{debug, warn};
32#[cfg(windows)]
33use windows_sys::Win32::System::Ioctl::{FILE_ZERO_DATA_INFORMATION, FSCTL_SET_ZERO_DATA};
34#[cfg(windows)]
35use windows_sys::Win32::System::IO::DeviceIoControl;
36
37/// Use a plain file or host block device as a storage object.
38#[derive(Debug)]
39pub struct File {
40    /// The file.
41    file: RwLock<fs::File>,
42
43    /// For debug purposes, and to resolve relative filenames.
44    filename: Option<PathBuf>,
45
46    /// Minimal I/O alignment for requests.
47    req_align: usize,
48
49    /// Minimal memory buffer alignment.
50    mem_align: usize,
51
52    /// Minimum required alignment for zero writes.
53    zero_align: usize,
54
55    /// Minimum required alignment for effective discards.
56    discard_align: usize,
57
58    /// Cached file length.
59    ///
60    /// Third parties changing the length concurrently is pretty certain to break things anyway.
61    size: AtomicU64,
62
63    /// Storage helper.
64    common_storage_helper: CommonStorageHelper,
65
66    /// macOS-only: Use fsync() instead of F_FULLFSYNC on `sync()` method.
67    #[cfg(target_os = "macos")]
68    relaxed_sync: bool,
69
70    /// Set once we know that discard is unsupported and we can skip trying.
71    discard_unsupported: AtomicBool,
72}
73
74impl TryFrom<fs::File> for File {
75    type Error = io::Error;
76
77    /// Use the given existing `std::fs::File`.
78    ///
79    /// Convert the given existing `std::fs::File` object into an imago storage object.
80    ///
81    /// When using this, the resulting object will not know its own filename.  That makes it
82    /// impossible to auto-resolve relative paths to it, e.g. qcow2 backing file names.
83    fn try_from(file: fs::File) -> io::Result<Self> {
84        Self::new(
85            file,
86            None,
87            false,
88            #[cfg(target_os = "macos")]
89            false,
90        )
91    }
92}
93
94impl Storage for File {
95    async fn open(opts: StorageOpenOptions) -> io::Result<Self> {
96        Self::do_open_sync(opts, fs::OpenOptions::new())
97    }
98
99    #[cfg(feature = "sync-wrappers")]
100    fn open_sync(opts: StorageOpenOptions) -> io::Result<Self> {
101        Self::do_open_sync(opts, fs::OpenOptions::new())
102    }
103
104    async fn create_open(opts: StorageCreateOptions) -> io::Result<Self> {
105        // Always allow writing for new files
106        let opts = opts.modify_open_opts(|o| o.write(true));
107        let size = opts.size;
108        let prealloc_mode = opts.prealloc_mode;
109
110        let mut file_opts = fs::OpenOptions::new();
111        if opts.overwrite {
112            file_opts.create(true).truncate(true);
113        } else {
114            file_opts.create_new(true);
115        };
116
117        let file = Self::do_open_sync(opts.get_open_options(), file_opts)?;
118        if size > 0 {
119            file.resize(size, prealloc_mode)
120                .await
121                .err_context(|| "Resizing file")?;
122        }
123
124        Ok(file)
125    }
126
127    fn mem_align(&self) -> usize {
128        self.mem_align
129    }
130
131    fn req_align(&self) -> usize {
132        self.req_align
133    }
134
135    fn zero_align(&self) -> usize {
136        cmp::max(self.zero_align, self.req_align)
137    }
138
139    fn discard_align(&self) -> usize {
140        cmp::max(self.discard_align, self.req_align)
141    }
142
143    fn size(&self) -> io::Result<u64> {
144        Ok(self.size.load(Ordering::Relaxed))
145    }
146
147    fn resolve_relative_path<P: AsRef<Path>>(&self, relative: P) -> io::Result<PathBuf> {
148        let relative = relative.as_ref();
149
150        if relative.is_absolute() {
151            return Ok(relative.to_path_buf());
152        }
153
154        let filename = self
155            .filename
156            .as_ref()
157            .ok_or_else(|| io::Error::other("No filename set for base image"))?;
158
159        let dirname = filename
160            .parent()
161            .ok_or_else(|| io::Error::other("Invalid base image filename set"))?;
162
163        Ok(dirname.join(relative))
164    }
165
166    fn get_filename(&self) -> Option<PathBuf> {
167        self.filename.as_ref().cloned()
168    }
169
170    #[cfg(unix)]
171    async unsafe fn pure_readv(
172        &self,
173        mut bufv: IoVectorMut<'_>,
174        mut offset: u64,
175    ) -> io::Result<()> {
176        while !bufv.is_empty() {
177            let iovec = unsafe { bufv.as_iovec() };
178            let preadv_offset = offset
179                .try_into()
180                .map_err(|_| io::Error::other("Read offset overflow"))?;
181
182            let len = while_eintr(|| unsafe {
183                libc::preadv(
184                    self.file.read().unwrap().as_raw_fd(),
185                    iovec.as_ptr(),
186                    iovec.len() as libc::c_int,
187                    preadv_offset,
188                )
189            })? as u64;
190
191            if len == 0 {
192                // End of file
193                bufv.fill(0);
194                break;
195            }
196
197            bufv = bufv.split_tail_at(len);
198            offset = offset
199                .checked_add(len)
200                .ok_or_else(|| io::Error::other("Read offset overflow"))?;
201        }
202
203        Ok(())
204    }
205
206    #[cfg(windows)]
207    async unsafe fn pure_readv(&self, bufv: IoVectorMut<'_>, mut offset: u64) -> io::Result<()> {
208        for mut buffer in bufv.into_inner() {
209            let mut buffer: &mut [u8] = &mut buffer;
210            while !buffer.is_empty() {
211                let len = if offset >= self.size.load(Ordering::Relaxed) {
212                    buffer.fill(0);
213                    buffer.len()
214                } else {
215                    self.file.write().unwrap().seek_read(buffer, offset)?
216                };
217                offset = offset
218                    .checked_add(len as u64)
219                    .ok_or_else(|| io::Error::other("Read offset overflow"))?;
220                buffer = buffer.split_at_mut(len).1;
221            }
222        }
223        Ok(())
224    }
225
226    #[cfg(unix)]
227    async unsafe fn pure_writev(&self, mut bufv: IoVector<'_>, mut offset: u64) -> io::Result<()> {
228        while !bufv.is_empty() {
229            let iovec = unsafe { bufv.as_iovec() };
230            let pwritev_offset = offset
231                .try_into()
232                .map_err(|_| io::Error::other("Write offset overflow"))?;
233
234            let len = while_eintr(|| unsafe {
235                libc::pwritev(
236                    self.file.read().unwrap().as_raw_fd(),
237                    iovec.as_ptr(),
238                    iovec.len() as libc::c_int,
239                    pwritev_offset,
240                )
241            })? as u64;
242
243            if len == 0 {
244                // Should not happen, i.e. is an error
245                return Err(io::ErrorKind::WriteZero.into());
246            }
247
248            bufv = bufv.split_tail_at(len);
249            offset = offset
250                .checked_add(len)
251                .ok_or_else(|| io::Error::other("Write offset overflow"))?;
252            self.size.fetch_max(offset, Ordering::Relaxed);
253        }
254
255        Ok(())
256    }
257
258    #[cfg(windows)]
259    async unsafe fn pure_writev(&self, bufv: IoVector<'_>, mut offset: u64) -> io::Result<()> {
260        for buffer in bufv.into_inner() {
261            let mut buffer: &[u8] = &buffer;
262            while !buffer.is_empty() {
263                let len = self.file.write().unwrap().seek_write(buffer, offset)?;
264                offset = offset
265                    .checked_add(len as u64)
266                    .ok_or_else(|| io::Error::other("Write offset overflow"))?;
267                self.size.fetch_max(offset, Ordering::Relaxed);
268                buffer = buffer.split_at(len).1;
269            }
270        }
271        Ok(())
272    }
273
274    async unsafe fn pure_write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> {
275        self.discard_to_zero(offset, length).await
276    }
277
278    #[cfg(target_os = "linux")]
279    async unsafe fn pure_write_allocated_zeroes(&self, offset: u64, length: u64) -> io::Result<()> {
280        let offset: libc::off_t = offset
281            .try_into()
282            .map_err(|e| io::Error::other(format!("Discard/write-zeroes offset error: {e}")))?;
283        let length: libc::off_t = length
284            .try_into()
285            .map_err(|e| io::Error::other(format!("Discard/write-zeroes length error: {e}")))?;
286
287        let file = self.file.read().unwrap();
288        // Safe: File descriptor is valid, and the rest are simple integer parameters.
289        while_eintr(|| unsafe {
290            libc::fallocate(file.as_raw_fd(), libc::FALLOC_FL_ZERO_RANGE, offset, length)
291        })
292        .map_err(Self::map_os_err)?;
293
294        Ok(())
295    }
296
297    async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> {
298        if let Err(err) = self.discard_to_zero(offset, length).await {
299            // Ignore `Unsupported` errors: As per the `pure_discard` documentation, a no-op
300            // implementation is acceptable.  In addition, the default implementation returns
301            // `Ok(())`, and it makes no sense to be harsher than that here.
302            if err.kind() == io::ErrorKind::Unsupported {
303                Ok(())
304            } else {
305                Err(err)
306            }
307        } else {
308            Ok(())
309        }
310    }
311
312    async fn flush(&self) -> io::Result<()> {
313        self.file.write().unwrap().flush()
314    }
315
316    async fn sync(&self) -> io::Result<()> {
317        #[cfg(target_os = "macos")]
318        if self.relaxed_sync {
319            // Safe: File descriptor is valid and there aren't any other arguments.
320            while_eintr(|| unsafe { libc::fsync(self.file.write().unwrap().as_raw_fd()) })?;
321            return Ok(());
322        }
323        self.file.write().unwrap().sync_all()
324    }
325
326    async unsafe fn invalidate_cache(&self) -> io::Result<()> {
327        // TODO: Figure out what to do.  Generally, `std::fs::File` does not have internal buffers,
328        // so we don’t need to invalidate anything; we could close and reopen, but that would still
329        // flush, and is difficult to do in a platform-independent way (/proc/self/fd would allow
330        // this on Linux).  Using e.g. the filename is not safe.
331        // Right now, it’s best not to do anything.
332        Ok(())
333    }
334
335    fn get_storage_helper(&self) -> &CommonStorageHelper {
336        &self.common_storage_helper
337    }
338
339    async fn resize(&self, new_size: u64, prealloc_mode: PreallocateMode) -> io::Result<()> {
340        let file = self.file.write().unwrap();
341        let current_size = self.size.load(Ordering::Relaxed);
342
343        match new_size.cmp(&current_size) {
344            std::cmp::Ordering::Equal => return Ok(()),
345            std::cmp::Ordering::Less => {
346                file.set_len(new_size)?;
347                self.size.fetch_min(new_size, Ordering::Relaxed);
348                return Ok(());
349            }
350            std::cmp::Ordering::Greater => (), // handled below
351        }
352
353        match prealloc_mode {
354            PreallocateMode::None | PreallocateMode::Zero => file.set_len(new_size)?,
355            PreallocateMode::Allocate => {
356                #[cfg(not(unix))]
357                return Err(io::ErrorKind::Unsupported.into());
358
359                #[cfg(all(unix, not(target_os = "macos")))]
360                {
361                    let ofs = current_size.try_into().map_err(io::Error::other)?;
362                    let len = (new_size - current_size)
363                        .try_into()
364                        .map_err(io::Error::other)?;
365                    while_eintr(|| unsafe { libc::fallocate(file.as_raw_fd(), 0, ofs, len) })
366                        .map_err(Self::map_os_err)?;
367                }
368
369                #[cfg(target_os = "macos")]
370                {
371                    // Best-effort.  PEOFPOSMODE allocates from the “physical” EOF, wherever that
372                    // may be, but the only alternative would be VOLPOSMODE, which nobody knows the
373                    // meaning of.  Also doesn’t change the file length, we need to truncate
374                    // afterwards still.
375                    let mut params = libc::fstore_t {
376                        fst_flags: libc::F_ALLOCATEALL,
377                        fst_posmode: libc::F_PEOFPOSMODE,
378                        fst_offset: 0,
379                        fst_length: (new_size - current_size)
380                            .try_into()
381                            .map_err(io::Error::other)?,
382                        fst_bytesalloc: 0, // output
383                    };
384                    while_eintr(|| unsafe {
385                        libc::fcntl(file.as_raw_fd(), libc::F_PREALLOCATE, &mut params)
386                    })
387                    .map_err(Self::map_os_err)?;
388
389                    file.set_len(new_size)?;
390                }
391            }
392            PreallocateMode::WriteData => {
393                // FIXME: Keeping the lock would be nice, but resizing concurrently with I/O is
394                // pretty risky anyway.
395                drop(file);
396                write_full_zeroes(self, current_size, new_size - current_size).await?;
397            }
398        }
399
400        self.size.fetch_max(new_size, Ordering::Relaxed);
401        Ok(())
402    }
403}
404
405impl File {
406    /// Central internal function to create a `File` object.
407    ///
408    /// `direct_io` should be `true` if direct I/O was requested, and can be `false` if that status
409    /// is unknown.
410    fn new(
411        mut file: fs::File,
412        filename: Option<PathBuf>,
413        direct_io: bool,
414        #[cfg(target_os = "macos")] relaxed_sync: bool,
415    ) -> io::Result<Self> {
416        let size = get_file_size(&file).err_context(|| "Failed to determine file size")?;
417
418        #[cfg(all(unix, not(target_os = "macos")))]
419        let direct_io = direct_io || {
420            // Safe: No argument, returns result.
421            let res = unsafe { libc::fcntl(file.as_raw_fd(), libc::F_GETFL) };
422            res > 0 && (res & libc::O_DIRECT) != 0
423        };
424
425        let (min_req_align, min_mem_align) = if direct_io {
426            #[cfg(unix)]
427            {
428                (
429                    Self::get_min_dio_req_align(&file),
430                    Self::get_min_dio_mem_align(&file),
431                )
432            }
433
434            #[cfg(not(unix))]
435            {
436                (1, 1)
437            } // probe it then
438        } else {
439            (1, 1)
440        };
441
442        let (req_align, mem_align, zero_align, discard_align) =
443            Self::probe_alignments(&mut file, min_req_align, min_mem_align);
444        assert!(req_align.is_power_of_two());
445        assert!(mem_align.is_power_of_two());
446
447        Ok(File {
448            file: RwLock::new(file),
449            filename,
450            req_align,
451            mem_align,
452            zero_align,
453            discard_align,
454            size: size.into(),
455            common_storage_helper: Default::default(),
456            #[cfg(target_os = "macos")]
457            relaxed_sync,
458            discard_unsupported: AtomicBool::new(false),
459        })
460    }
461
462    /// Probe minimal request, memory, zero and discard alignments.
463    ///
464    /// Start at `min_req_align` and `min_mem_align`.
465    #[cfg(unix)]
466    fn probe_alignments(
467        file: &mut fs::File,
468        min_req_align: usize,
469        min_mem_align: usize,
470    ) -> (usize, usize, usize, usize) {
471        let mut page_size = page_size::get();
472        if !page_size.is_power_of_two() {
473            let assume = page_size.checked_next_power_of_two().unwrap_or(4096);
474            let assume = cmp::max(4096, assume);
475            warn!("Reported page size of {page_size} is not a power of two, assuming {assume}");
476            page_size = assume;
477        }
478
479        #[cfg(any(target_os = "linux", target_os = "macos"))]
480        let (mut zero_align, mut discard_align) = {
481            let mut statfs: libc::statfs = unsafe { std::mem::zeroed() };
482            // Safe: FD is valid, passed pointer is valid and its type matches the call.
483            match while_eintr(|| unsafe { libc::fstatfs(file.as_raw_fd(), &mut statfs) }) {
484                // On macOS, `f_bsize` is the fundamental block size.  On Linux, `f_bsize` is the
485                // optimal transfer block size and `f_frsize` is the actual block size.
486                #[cfg(target_os = "linux")]
487                Ok(_) => (statfs.f_frsize as usize, statfs.f_frsize as usize),
488                #[cfg(target_os = "macos")]
489                Ok(_) => (statfs.f_bsize as usize, statfs.f_bsize as usize),
490
491                Err(_) => (page_size, page_size),
492            }
493        };
494
495        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
496        let (mut zero_align, mut discard_align) = (page_size, page_size);
497
498        // Double-check to make absolutely sure both are powers of two
499        if !zero_align.is_power_of_two() {
500            zero_align = page_size;
501        }
502        if !discard_align.is_power_of_two() {
503            discard_align = page_size;
504        }
505
506        let mut writable = true;
507
508        let max_req_align = 65536;
509        let max_mem_align = cmp::max(page_size, max_req_align);
510
511        // Minimum fallbacks in case something goes wrong.
512        let safe_req_align = 4096;
513        let safe_mem_align = cmp::max(page_size, safe_req_align);
514
515        let mut test_buf = match IoBuffer::new(max_mem_align, max_mem_align) {
516            Ok(buf) => buf,
517            Err(err) => {
518                warn!(
519                    "Failed to allocate memory to probe request alignment ({err}), \
520                    falling back to {safe_req_align}/{safe_mem_align}"
521                );
522                return (safe_req_align, safe_mem_align, zero_align, discard_align);
523            }
524        };
525
526        let mut req_align: usize = min_req_align;
527        let result = loop {
528            assert!(req_align <= max_mem_align);
529            match Self::probe_access(
530                file,
531                test_buf.as_mut_range(0..req_align).into_slice(),
532                req_align.try_into().unwrap(),
533                &mut writable,
534            ) {
535                Ok(true) => break Ok(req_align),
536                Ok(false) => {
537                    if req_align >= max_req_align {
538                        break Err(io::Error::other(format!(
539                            "Maximum I/O alignment ({max_req_align}) exceeded"
540                        )));
541                    }
542                    // No reason to probe anything between 1 and 512
543                    if req_align == min_req_align {
544                        req_align = cmp::max(min_req_align << 1, 512);
545                    } else {
546                        req_align <<= 1;
547                    }
548                }
549                Err(err) => break Err(err),
550            }
551        };
552
553        let req_align = match result {
554            Ok(align) => {
555                debug!("Probed request alignment: {align}");
556                align
557            }
558            Err(err) => {
559                // Failed to determine request alignment, use a presumably safe value
560                let align = cmp::max(req_align, safe_req_align);
561                warn!(
562                    "Failed to probe request alignment ({err}; {}), falling back to {align} bytes",
563                    err.kind(),
564                );
565                align
566            }
567        };
568
569        let mut mem_align: usize = min_mem_align;
570        let result = loop {
571            assert!(mem_align <= max_mem_align);
572            let range = (max_mem_align - mem_align)..max_mem_align;
573            match Self::probe_access(
574                file,
575                test_buf.as_mut_range(range).into_slice(),
576                0,
577                &mut writable,
578            ) {
579                Ok(true) => break Ok(mem_align),
580                Ok(false) => {
581                    // Not aligned
582                    if mem_align >= max_mem_align {
583                        break Err(io::Error::other(format!(
584                            "Maximum memory alignment ({max_mem_align}) exceeded"
585                        )));
586                    }
587                    // No reason to probe anything between 1 and the page size (or 4096 at least)
588                    if mem_align == min_mem_align {
589                        mem_align = cmp::max(min_mem_align << 1, cmp::min(page_size, 4096));
590                    } else {
591                        mem_align <<= 1;
592                    }
593                }
594                Err(err) => break Err(err),
595            }
596        };
597
598        let mem_align = match result {
599            Ok(align) => {
600                debug!("Probed memory alignment: {align}");
601                align
602            }
603            Err(err) => {
604                // Failed to determine memory alignment, use a presumably safe value
605                let align = cmp::max(mem_align, safe_mem_align);
606                warn!(
607                    "Failed to probe memory alignment ({err}; {}), falling back to {align} bytes",
608                    err.kind(),
609                );
610                align
611            }
612        };
613
614        (req_align, mem_align, zero_align, discard_align)
615    }
616
617    /// Do an alignment-probing I/O access.
618    ///
619    /// Return `Ok(true)` if everything was OK, and `Ok(false)` if the request was reported to be
620    /// misaligned.
621    ///
622    /// `may_write` is a boolean that controls whether this is allowed to write (the same data read
623    /// before) to improve reliability.  Is automatically set to `false` if writing is found to not
624    /// be possible.
625    #[cfg(unix)]
626    fn probe_access(
627        file: &mut fs::File,
628        slice: &mut [u8],
629        offset: libc::off_t,
630        may_write: &mut bool,
631    ) -> io::Result<bool> {
632        // Use `libc::pread` so we get well-defined errors.
633        // Safe: Passing the slice as the buffer it is.
634        let ret = while_eintr(|| unsafe {
635            libc::pread(
636                file.as_raw_fd(),
637                slice.as_mut_ptr() as *mut libc::c_void,
638                slice.len(),
639                offset,
640            )
641        });
642
643        if let Err(err) = ret {
644            if err.raw_os_error() == Some(libc::EINVAL) {
645                return Ok(false);
646            } else {
647                return Err(err);
648            }
649        }
650
651        if !*may_write {
652            return Ok(true);
653        }
654
655        // Safe: Passing the slice as the buffer it is.
656        let ret = while_eintr(|| unsafe {
657            libc::pwrite(
658                file.as_raw_fd(),
659                slice.as_ptr() as *const libc::c_void,
660                slice.len(),
661                offset,
662            )
663        });
664
665        if let Err(err) = ret {
666            if err.raw_os_error() == Some(libc::EINVAL) {
667                Ok(false)
668            } else if err.raw_os_error() == Some(libc::EBADF) {
669                *may_write = false;
670                Ok(true)
671            } else {
672                Err(err)
673            }
674        } else {
675            Ok(true)
676        }
677    }
678
679    /// Get system-reported minimum request alignment for direct I/O.
680    #[cfg(unix)]
681    fn get_min_dio_req_align(file: &fs::File) -> usize {
682        #[cfg(target_os = "linux")]
683        {
684            let mut alignment = 0;
685            let res = unsafe { ioctl::blksszget(file.as_raw_fd(), &mut alignment) };
686            if res.is_ok() && alignment > 0 {
687                let alignment = alignment as usize;
688                if alignment.is_power_of_two() {
689                    return alignment;
690                }
691            }
692        }
693
694        #[cfg(target_os = "macos")]
695        {
696            let mut alignment = 0;
697            let res = unsafe { ioctl::dkiocgetblocksize(file.as_raw_fd(), &mut alignment) };
698            if res.is_ok() && alignment.is_power_of_two() {
699                return alignment as usize;
700            }
701        }
702
703        #[cfg(target_os = "freebsd")]
704        {
705            let mut alignment = 0;
706            let res = unsafe { ioctl::diocgsectorsize(file.as_raw_fd(), &mut alignment) };
707            if res.is_ok() && alignment.is_power_of_two() {
708                return alignment as usize;
709            }
710        }
711
712        // Then we’ll probe.
713        1
714    }
715
716    /// Get system-reported minimum memory alignment for direct I/O.
717    #[cfg(unix)]
718    fn get_min_dio_mem_align(_file: &fs::File) -> usize {
719        // I don’t think there’s a reliable way to get this.
720        1
721    }
722
723    /// Probe minimal request and memory alignments.
724    ///
725    /// Start at `min_req_align` and `min_mem_align`.
726    #[cfg(windows)]
727    fn probe_alignments(
728        _file: &mut fs::File,
729        min_req_align: usize,
730        min_mem_align: usize,
731    ) -> (usize, usize, usize, usize) {
732        // TODO: Need to find out how Windows indicates unaligned I/O
733        (
734            cmp::max(min_req_align, 4096),
735            cmp::max(min_mem_align, 4096),
736            1,
737            1,
738        )
739    }
740
741    /// Implementation for anything that opens a file.
742    fn do_open_sync(opts: StorageOpenOptions, base_fs_opts: fs::OpenOptions) -> io::Result<Self> {
743        let Some(filename) = opts.filename else {
744            return Err(io::Error::new(
745                io::ErrorKind::InvalidInput,
746                "Filename required",
747            ));
748        };
749
750        let mut file_opts = base_fs_opts;
751        file_opts.read(true).write(opts.writable);
752        #[cfg(not(target_os = "macos"))]
753        if opts.direct {
754            file_opts.custom_flags(
755                #[cfg(unix)]
756                libc::O_DIRECT,
757                #[cfg(windows)]
758                windows_sys::Win32::Storage::FileSystem::FILE_FLAG_NO_BUFFERING,
759            );
760        }
761
762        let filename_owned = filename.to_owned();
763        let file = file_opts.open(filename)?;
764
765        #[cfg(target_os = "macos")]
766        if opts.direct {
767            // Safe: We check the return value.
768            while_eintr(|| unsafe { libc::fcntl(file.as_raw_fd(), libc::F_NOCACHE, 1) })
769                .err_context(|| "Failed to disable host cache")?;
770        }
771
772        Self::new(
773            file,
774            Some(filename_owned),
775            opts.direct,
776            #[cfg(target_os = "macos")]
777            opts.relaxed_sync,
778        )
779    }
780
781    /// For special operations, ensure the error kind is usable.
782    ///
783    /// When invoking OS I/O operations directly, we turn the returned raw OS error code into an
784    /// `io::Error` object via `io::Error::last_os_error()`.  To differentiate between different
785    /// error cases, in generic imago code, we then don’t use that raw error code, but the error
786    /// kind (`io::ErrorKind`) instead, specifically it’s important to properly return an error of
787    /// kind `io::ErrorKind::Unsupported` when an operation is unsupported, so fall-backs can be
788    /// employed.
789    ///
790    /// Rust’s standard library only assigns this error kind (`Unsupported`) to `EOPNOTSUPP` (=
791    /// `ENOTSUP`) and `ENOSYS`.  However, some “special” operations (`fallocate()`,
792    /// `fcntl(F_PUNCHHOLE)`, `ioctl()`, ...) can return other error codes for when an operation is
793    /// not supported on a specific file, e.g. `ENODEV` or `ENXIO`.
794    ///
795    /// Assign the appropriate error kind to such errors so the generic code can handle them.
796    #[cfg(unix)]
797    fn map_os_err(err: io::Error) -> io::Error {
798        let Some(raw) = err.raw_os_error() else {
799            return err;
800        };
801
802        let has_kind = err.kind();
803        let want_kind = match raw {
804            #[allow(unreachable_patterns)] // `ENOTSUP` may be equal to `EOPNOTSUPP`
805            libc::ENOTSUP | libc::EOPNOTSUPP | libc::ENODEV | libc::ENXIO | libc::ENOTTY => {
806                io::ErrorKind::Unsupported
807            }
808            _ => has_kind,
809        };
810
811        if has_kind != want_kind {
812            io::Error::new(want_kind, err)
813        } else {
814            err
815        }
816    }
817
818    /// For special operations, ensure the error kind is usable.
819    ///
820    /// For non-UNIX systems, this is an identity map.
821    #[cfg(not(unix))]
822    fn map_os_err(err: io::Error) -> io::Error {
823        err
824    }
825
826    /// Attempt to discard range by truncating the file.
827    ///
828    /// If the given range is at the end of the file, discard it by simply truncating the file.
829    /// Return `true` on success.
830    ///
831    /// If the range is not at the end of the file, i.e. another method of discarding is needed,
832    /// return `false`.
833    fn try_discard_by_truncate(&self, offset: u64, length: u64) -> io::Result<bool> {
834        // Prevent modifications to the file length
835        #[allow(clippy::readonly_write_lock)]
836        let file = self.file.write().unwrap();
837
838        let size = self.size.load(Ordering::Relaxed);
839        if offset >= size {
840            // Nothing to do
841            return Ok(true);
842        }
843
844        // If `offset + length` overflows, we can just assume it ends at `size`.  (Anything past
845        // `size is irrelevant anyway.)
846        let end = offset.checked_add(length).unwrap_or(size);
847        if end < size {
848            return Ok(false);
849        }
850
851        file.set_len(offset)?;
852        Ok(true)
853    }
854
855    /// Ensure the given range reads back as zeroes, or return an error.
856    async fn discard_to_zero(&self, offset: u64, length: u64) -> io::Result<()> {
857        if self.try_discard_by_truncate(offset, length)? {
858            return Ok(());
859        }
860
861        if self.discard_unsupported.load(Ordering::Relaxed) {
862            Err(io::ErrorKind::Unsupported.into())
863        } else if let Err(err) = self.discard_to_zero_os_specific(offset, length).await {
864            if err.kind() == io::ErrorKind::Unsupported {
865                self.discard_unsupported.store(true, Ordering::Relaxed);
866            }
867            Err(err)
868        } else {
869            Ok(())
870        }
871    }
872
873    /// Via OS-specific means, ensure the given range reads back as zeroes, or return an error.
874    #[cfg(target_os = "linux")]
875    async fn discard_to_zero_os_specific(&self, offset: u64, length: u64) -> io::Result<()> {
876        let offset: libc::off_t = offset
877            .try_into()
878            .map_err(|e| io::Error::other(format!("Discard/write-zeroes offset error: {e}")))?;
879        let length: libc::off_t = length
880            .try_into()
881            .map_err(|e| io::Error::other(format!("Discard/write-zeroes length error: {e}")))?;
882
883        let file = self.file.read().unwrap();
884        // Safe: File descriptor is valid, and the rest are simple integer parameters.
885        while_eintr(|| unsafe {
886            libc::fallocate(
887                file.as_raw_fd(),
888                libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
889                offset,
890                length,
891            )
892        })
893        .map_err(Self::map_os_err)?;
894
895        Ok(())
896    }
897
898    /// Via OS-specific means, ensure the given range reads back as zeroes, or return an error.
899    #[cfg(windows)]
900    async fn discard_to_zero_os_specific(&self, offset: u64, length: u64) -> io::Result<()> {
901        let offset: i64 = offset
902            .try_into()
903            .map_err(|e| io::Error::other(format!("Discard/write-zeroes offset error: {e}")))?;
904        let length: i64 = length
905            .try_into()
906            .map_err(|e| io::Error::other(format!("Discard/write-zeroes length error: {e}")))?;
907
908        let end = offset.saturating_add(length).saturating_add(1);
909        let params = FILE_ZERO_DATA_INFORMATION {
910            FileOffset: offset,
911            BeyondFinalZero: end,
912        };
913        let mut _returned = 0;
914        let file = self.file.read().unwrap();
915        // Safe: File handle is valid, mandatory pointers (input, returned length) are passed and
916        // valid, the parameter type matches the call, and the input size matches the object
917        // passed.
918        let ret = unsafe {
919            DeviceIoControl(
920                file.as_raw_handle(),
921                FSCTL_SET_ZERO_DATA,
922                (&params as *const FILE_ZERO_DATA_INFORMATION).cast::<std::ffi::c_void>(),
923                size_of_val(&params) as u32,
924                std::ptr::null_mut(),
925                0,
926                &mut _returned,
927                std::ptr::null_mut(),
928            )
929        };
930        if ret == 0 {
931            return Err(Self::map_os_err(io::Error::last_os_error()));
932        }
933
934        Ok(())
935    }
936
937    /// Via OS-specific means, ensure the given range reads back as zeroes, or return an error.
938    #[cfg(target_os = "macos")]
939    async fn discard_to_zero_os_specific(&self, offset: u64, length: u64) -> io::Result<()> {
940        let offset: libc::off_t = offset
941            .try_into()
942            .map_err(|e| io::Error::other(format!("Discard/write-zeroes offset error: {e}")))?;
943        let length: libc::off_t = length
944            .try_into()
945            .map_err(|e| io::Error::other(format!("Discard/write-zeroes length error: {e}")))?;
946
947        let params = libc::fpunchhole_t {
948            fp_flags: 0,
949            reserved: 0,
950            fp_offset: offset,
951            fp_length: length,
952        };
953        let file = self.file.read().unwrap();
954        // Safe: FD is valid, passed pointer is valid and its type matches the call.
955        while_eintr(|| unsafe { libc::fcntl(file.as_raw_fd(), libc::F_PUNCHHOLE, &params) })
956            .map_err(Self::map_os_err)?;
957
958        Ok(())
959    }
960
961    /// Via OS-specific means, ensure the given range reads back as zeroes, or return an error.
962    #[cfg(not(any(target_os = "linux", target_os = "macos", windows)))]
963    async fn discard_to_zero_os_specific(&self, offset: u64, length: u64) -> io::Result<()> {
964        Err(io::ErrorKind::Unsupported.into())
965    }
966}
967
968impl Display for File {
969    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
970        if let Some(filename) = self.filename.as_ref() {
971            write!(f, "file:{filename:?}")
972        } else {
973            write!(f, "file:<unknown path>")
974        }
975    }
976}
977
978/// Get total size in bytes of the given file.
979///
980/// If the file is a block or character device, use get_device_size() instead of
981/// reading len from metadata which doesn't work on some platforms like macOS.
982fn get_file_size(file: &fs::File) -> io::Result<u64> {
983    #[allow(clippy::bind_instead_of_map)]
984    file.metadata().and_then(|m| {
985        #[cfg(unix)]
986        if m.file_type().is_block_device() || m.file_type().is_char_device() {
987            return get_device_size(file);
988        }
989        Ok(m.len())
990    })
991}
992
993cfg_if! {
994    if #[cfg(target_os = "linux")] {
995        /// Get total size in bytes of the given block or character device.
996        fn get_device_size(file: &fs::File) -> io::Result<u64> {
997            let mut size = 0;
998            unsafe { ioctl::blkgetsize64(file.as_raw_fd(), &mut size) }?;
999            Ok(size)
1000        }
1001    } else if #[cfg(target_os = "macos")] {
1002        /// Get total size in bytes of the given block or character device.
1003        fn get_device_size(file: &fs::File) -> io::Result<u64> {
1004            let mut block_size = 0;
1005            unsafe { ioctl::dkiocgetblocksize(file.as_raw_fd(), &mut block_size) }?;
1006            let mut block_count = 0;
1007            unsafe { ioctl::dkiocgetblockcount(file.as_raw_fd(), &mut block_count) }?;
1008            Ok(u64::from(block_size) * block_count)
1009        }
1010    } else if #[cfg(target_os = "freebsd")] {
1011        /// Get total size in bytes of the given block or character device.
1012        fn get_device_size(file: &fs::File) -> io::Result<u64> {
1013            let mut size = 0;
1014            unsafe { ioctl::diocgmediasize(file.as_raw_fd(), &mut size) }?;
1015            Ok(size as u64)
1016        }
1017    } else if #[cfg(unix)] {
1018        /// Get total size in bytes of the given block or character device - unsupported platform.
1019        fn get_device_size(_file: &fs::File) -> io::Result<u64> {
1020            Err(io::ErrorKind::Unsupported.into())
1021        }
1022    }
1023}
1024
/// This module generates type-safe wrappers for chosen ioctls
///
/// Every wrapper is produced by one of `nix`'s ioctl macros and is only compiled for the target
/// OS whose ioctl it wraps.  The generated functions are `unsafe` to call.
mod ioctl {
    // `ioctl_read!` is used by the wrappers for Linux, macOS and FreeBSD below.
    #[cfg(unix)]
    use nix::ioctl_read;
    // `ioctl_read_bad!` is only needed for the Linux `BLKSSZGET` wrapper.
    #[cfg(target_os = "linux")]
    use nix::ioctl_read_bad;

    // https://github.com/torvalds/linux/blob/master/include/uapi/linux/fs.h#L200

    // Reads a `u64`; `get_device_size()` uses the result as the device size in bytes.
    #[cfg(target_os = "linux")]
    ioctl_read!(blkgetsize64, 0x12, 114, u64);

    // Reads a `c_int`, using the request number `libc::BLKSSZGET` as-is.
    // NOTE(review): presumably the device's logical sector size -- confirm against the callers.
    #[cfg(target_os = "linux")]
    ioctl_read_bad!(blksszget, libc::BLKSSZGET, libc::c_int);

    // https://github.com/apple-oss-distributions/xnu/blob/main/bsd/sys/disk.h#L198-L199

    // Reads a `u32`; `get_device_size()` multiplies it with the block count below.
    #[cfg(target_os = "macos")]
    ioctl_read!(dkiocgetblocksize, 'd', 24, u32);

    // Reads a `u64`; `get_device_size()` multiplies it with the block size above.
    #[cfg(target_os = "macos")]
    ioctl_read!(dkiocgetblockcount, 'd', 25, u64);

    // https://web.mit.edu/freebsd/head/sys/sys/disk.h

    // Reads a `c_uint`.
    // NOTE(review): presumably the device's sector size in bytes -- confirm against the callers.
    #[cfg(target_os = "freebsd")]
    ioctl_read!(diocgsectorsize, 'd', 128, libc::c_uint);

    // Reads an `off_t`; `get_device_size()` uses the result as the device size in bytes.
    #[cfg(target_os = "freebsd")]
    ioctl_read!(diocgmediasize, 'd', 129, libc::off_t);
}