imago/
file.rs

1//! Use a plain file or host block device as storage.
2
3#[cfg(unix)]
4use crate::io_buffers::IoBuffer;
5use crate::io_buffers::{IoVector, IoVectorMut};
6#[cfg(unix)]
7use crate::misc_helpers::while_eintr;
8use crate::misc_helpers::ResultErrorContext;
9use crate::storage::drivers::CommonStorageHelper;
10use crate::storage::ext::write_full_zeroes;
11use crate::storage::PreallocateMode;
12use crate::{Storage, StorageCreateOptions, StorageOpenOptions};
13use cfg_if::cfg_if;
14use std::fmt::{self, Display, Formatter};
15use std::io::{self, Write};
16#[cfg(any(target_os = "linux", target_os = "macos"))]
17use std::os::fd::AsRawFd;
18#[cfg(unix)]
19use std::os::unix::fs::FileTypeExt;
20#[cfg(all(unix, not(target_os = "macos")))]
21use std::os::unix::fs::OpenOptionsExt;
22#[cfg(windows)]
23use std::os::windows::fs::{FileExt, OpenOptionsExt};
24#[cfg(windows)]
25use std::os::windows::io::AsRawHandle;
26use std::path::{Path, PathBuf};
27use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
28use std::sync::RwLock;
29use std::{cmp, fs};
30#[cfg(unix)]
31use tracing::{debug, warn};
32#[cfg(windows)]
33use windows_sys::Win32::System::Ioctl::{FILE_ZERO_DATA_INFORMATION, FSCTL_SET_ZERO_DATA};
34#[cfg(windows)]
35use windows_sys::Win32::System::IO::DeviceIoControl;
36
/// Use a plain file or host block device as a storage object.
///
/// Besides the file itself, this keeps the alignment requirements determined when the object is
/// created, plus a cached copy of the file length.
#[derive(Debug)]
pub struct File {
    /// The file.
    ///
    /// Operations that change the file length (e.g. `resize()`) take the write lock; the UNIX
    /// vectored read/write paths only take the read lock.
    file: RwLock<fs::File>,

    /// For debug purposes, and to resolve relative filenames.
    ///
    /// `None` when the object was created from an existing `std::fs::File`.
    filename: Option<PathBuf>,

    /// Minimal I/O alignment for requests.
    req_align: usize,

    /// Minimal memory buffer alignment.
    mem_align: usize,

    /// Minimum required alignment for zero writes.
    zero_align: usize,

    /// Minimum required alignment for effective discards.
    discard_align: usize,

    /// Cached file length.
    ///
    /// Read once when the object is created, then adjusted by writes and resizes.
    ///
    /// Third parties changing the length concurrently is pretty certain to break things anyway.
    size: AtomicU64,

    /// Storage helper.
    common_storage_helper: CommonStorageHelper,

    /// macOS-only: Use fsync() instead of F_FULLFSYNC on `sync()` method.
    #[cfg(target_os = "macos")]
    relaxed_sync: bool,

    /// Set once we know that discard is unsupported and we can skip trying.
    ///
    /// Starts out `false`; set when the OS-specific discard reports `Unsupported`.
    discard_unsupported: AtomicBool,
}
73
74impl TryFrom<fs::File> for File {
75    type Error = io::Error;
76
77    /// Use the given existing `std::fs::File`.
78    ///
79    /// Convert the given existing `std::fs::File` object into an imago storage object.
80    ///
81    /// When using this, the resulting object will not know its own filename.  That makes it
82    /// impossible to auto-resolve relative paths to it, e.g. qcow2 backing file names.
83    fn try_from(file: fs::File) -> io::Result<Self> {
84        Self::new(
85            file,
86            None,
87            false,
88            #[cfg(target_os = "macos")]
89            false,
90        )
91    }
92}
93
94impl Storage for File {
95    async fn open(opts: StorageOpenOptions) -> io::Result<Self> {
96        Self::do_open_sync(opts, fs::OpenOptions::new())
97    }
98
99    #[cfg(feature = "sync-wrappers")]
100    fn open_sync(opts: StorageOpenOptions) -> io::Result<Self> {
101        Self::do_open_sync(opts, fs::OpenOptions::new())
102    }
103
104    async fn create_open(opts: StorageCreateOptions) -> io::Result<Self> {
105        // Always allow writing for new files
106        let opts = opts.modify_open_opts(|o| o.write(true));
107        let size = opts.size;
108        let prealloc_mode = opts.prealloc_mode;
109
110        let mut file_opts = fs::OpenOptions::new();
111        if opts.overwrite {
112            file_opts.create(true).truncate(true);
113        } else {
114            file_opts.create_new(true);
115        };
116
117        let file = Self::do_open_sync(opts.get_open_options(), file_opts)?;
118        if size > 0 {
119            file.resize(size, prealloc_mode)
120                .await
121                .err_context(|| "Resizing file")?;
122        }
123
124        Ok(file)
125    }
126
    /// Return the minimal memory buffer alignment determined when this object was created.
    fn mem_align(&self) -> usize {
        self.mem_align
    }
130
    /// Return the minimal request (offset/length) alignment determined when this object was
    /// created.
    fn req_align(&self) -> usize {
        self.req_align
    }
134
    /// Return the minimum alignment required for zero writes.
    fn zero_align(&self) -> usize {
        self.zero_align
    }
138
    /// Return the minimum alignment required for discards to be effective.
    fn discard_align(&self) -> usize {
        self.discard_align
    }
142
143    fn size(&self) -> io::Result<u64> {
144        Ok(self.size.load(Ordering::Relaxed))
145    }
146
147    fn resolve_relative_path<P: AsRef<Path>>(&self, relative: P) -> io::Result<PathBuf> {
148        let relative = relative.as_ref();
149
150        if relative.is_absolute() {
151            return Ok(relative.to_path_buf());
152        }
153
154        let filename = self
155            .filename
156            .as_ref()
157            .ok_or_else(|| io::Error::other("No filename set for base image"))?;
158
159        let dirname = filename
160            .parent()
161            .ok_or_else(|| io::Error::other("Invalid base image filename set"))?;
162
163        Ok(dirname.join(relative))
164    }
165
166    fn get_filename(&self) -> Option<PathBuf> {
167        self.filename.as_ref().cloned()
168    }
169
    /// Read from the file at `offset` into `bufv` using `preadv()` (UNIX variant).
    ///
    /// Short reads are continued until `bufv` is full.  If the end of the file is reached first,
    /// the remaining part of `bufv` is filled with zeroes.
    ///
    /// Fails if `offset` cannot be represented as the OS offset type or overflows while
    /// advancing, or if `preadv()` fails with anything but `EINTR`.
    #[cfg(unix)]
    async unsafe fn pure_readv(
        &self,
        mut bufv: IoVectorMut<'_>,
        mut offset: u64,
    ) -> io::Result<()> {
        while !bufv.is_empty() {
            // NOTE(review): presumably yields `libc::iovec` entries describing the remaining
            // buffers -- confirm against `IoVectorMut::as_iovec()`
            let iovec = unsafe { bufv.as_iovec() };
            let preadv_offset = offset
                .try_into()
                .map_err(|_| io::Error::other("Read offset overflow"))?;

            // The read lock is only taken for the duration of each `preadv()` attempt
            let len = while_eintr(|| unsafe {
                libc::preadv(
                    self.file.read().unwrap().as_raw_fd(),
                    iovec.as_ptr(),
                    iovec.len() as libc::c_int,
                    preadv_offset,
                )
            })? as u64;

            if len == 0 {
                // End of file
                bufv.fill(0);
                break;
            }

            // Continue with whatever has not been read yet
            bufv = bufv.split_tail_at(len);
            offset = offset
                .checked_add(len)
                .ok_or_else(|| io::Error::other("Read offset overflow"))?;
        }

        Ok(())
    }
205
206    #[cfg(windows)]
207    async unsafe fn pure_readv(&self, bufv: IoVectorMut<'_>, mut offset: u64) -> io::Result<()> {
208        for mut buffer in bufv.into_inner() {
209            let mut buffer: &mut [u8] = &mut buffer;
210            while !buffer.is_empty() {
211                let len = if offset >= self.size.load(Ordering::Relaxed) {
212                    buffer.fill(0);
213                    buffer.len()
214                } else {
215                    self.file.write().unwrap().seek_read(buffer, offset)?
216                };
217                offset = offset
218                    .checked_add(len as u64)
219                    .ok_or_else(|| io::Error::other("Read offset overflow"))?;
220                buffer = buffer.split_at_mut(len).1;
221            }
222        }
223        Ok(())
224    }
225
    /// Write `bufv` to the file at `offset` using `pwritev()` (UNIX variant).
    ///
    /// Short writes are continued until all of `bufv` has been written.  After every successful
    /// write, the cached file size is raised to the end of the written range if that lies beyond
    /// it.
    ///
    /// Fails if `offset` cannot be represented as the OS offset type or overflows while
    /// advancing, if `pwritev()` fails with anything but `EINTR`, or (with
    /// `io::ErrorKind::WriteZero`) if `pwritev()` writes nothing.
    #[cfg(unix)]
    async unsafe fn pure_writev(&self, mut bufv: IoVector<'_>, mut offset: u64) -> io::Result<()> {
        while !bufv.is_empty() {
            // NOTE(review): presumably yields `libc::iovec` entries describing the remaining
            // buffers -- confirm against `IoVector::as_iovec()`
            let iovec = unsafe { bufv.as_iovec() };
            let pwritev_offset = offset
                .try_into()
                .map_err(|_| io::Error::other("Write offset overflow"))?;

            // The read lock is only taken for the duration of each `pwritev()` attempt
            let len = while_eintr(|| unsafe {
                libc::pwritev(
                    self.file.read().unwrap().as_raw_fd(),
                    iovec.as_ptr(),
                    iovec.len() as libc::c_int,
                    pwritev_offset,
                )
            })? as u64;

            if len == 0 {
                // Should not happen, i.e. is an error
                return Err(io::ErrorKind::WriteZero.into());
            }

            // Continue with whatever has not been written yet
            bufv = bufv.split_tail_at(len);
            offset = offset
                .checked_add(len)
                .ok_or_else(|| io::Error::other("Write offset overflow"))?;
            // The write may have grown the file
            self.size.fetch_max(offset, Ordering::Relaxed);
        }

        Ok(())
    }
257
258    #[cfg(windows)]
259    async unsafe fn pure_writev(&self, bufv: IoVector<'_>, mut offset: u64) -> io::Result<()> {
260        for buffer in bufv.into_inner() {
261            let mut buffer: &[u8] = &buffer;
262            while !buffer.is_empty() {
263                let len = self.file.write().unwrap().seek_write(buffer, offset)?;
264                offset = offset
265                    .checked_add(len as u64)
266                    .ok_or_else(|| io::Error::other("Write offset overflow"))?;
267                self.size.fetch_max(offset, Ordering::Relaxed);
268                buffer = buffer.split_at(len).1;
269            }
270        }
271        Ok(())
272    }
273
    /// Make the given range read back as zeroes.
    ///
    /// Delegates to `discard_to_zero()` and returns its result unchanged, including
    /// `Unsupported` errors.
    async unsafe fn pure_write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> {
        self.discard_to_zero(offset, length).await
    }
277
278    #[cfg(target_os = "linux")]
279    async unsafe fn pure_write_allocated_zeroes(&self, offset: u64, length: u64) -> io::Result<()> {
280        let offset: libc::off_t = offset
281            .try_into()
282            .map_err(|e| io::Error::other(format!("Discard/write-zeroes offset error: {e}")))?;
283        let length: libc::off_t = length
284            .try_into()
285            .map_err(|e| io::Error::other(format!("Discard/write-zeroes length error: {e}")))?;
286
287        let file = self.file.read().unwrap();
288        // Safe: File descriptor is valid, and the rest are simple integer parameters.
289        while_eintr(|| unsafe {
290            libc::fallocate(file.as_raw_fd(), libc::FALLOC_FL_ZERO_RANGE, offset, length)
291        })
292        .map_err(Self::map_os_err)?;
293
294        Ok(())
295    }
296
297    async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> {
298        if let Err(err) = self.discard_to_zero(offset, length).await {
299            // Ignore `Unsupported` errors: As per the `pure_discard` documentation, a no-op
300            // implementation is acceptable.  In addition, the default implementation returns
301            // `Ok(())`, and it makes no sense to be harsher than that here.
302            if err.kind() == io::ErrorKind::Unsupported {
303                Ok(())
304            } else {
305                Err(err)
306            }
307        } else {
308            Ok(())
309        }
310    }
311
312    async fn flush(&self) -> io::Result<()> {
313        self.file.write().unwrap().flush()
314    }
315
316    async fn sync(&self) -> io::Result<()> {
317        #[cfg(target_os = "macos")]
318        if self.relaxed_sync {
319            // Safe: File descriptor is valid and there aren't any other arguments.
320            while_eintr(|| unsafe { libc::fsync(self.file.write().unwrap().as_raw_fd()) })?;
321            return Ok(());
322        }
323        self.file.write().unwrap().sync_all()
324    }
325
    /// Invalidate internal caches; currently does nothing and always succeeds.
    async unsafe fn invalidate_cache(&self) -> io::Result<()> {
        // TODO: Figure out what to do.  Generally, `std::fs::File` does not have internal buffers,
        // so we don’t need to invalidate anything; we could close and reopen, but that would still
        // flush, and is difficult to do in a platform-independent way (/proc/self/fd would allow
        // this on Linux).  Using e.g. the filename is not safe.
        // Right now, it’s best not to do anything.
        Ok(())
    }
334
    /// Return a reference to this object's `CommonStorageHelper`.
    fn get_storage_helper(&self) -> &CommonStorageHelper {
        &self.common_storage_helper
    }
338
    /// Resize the file to `new_size`, preallocating new space according to `prealloc_mode`.
    ///
    /// The comparison is done against the cached size:
    /// - Same size: nothing is done.
    /// - Shrinking: the file is truncated; `prealloc_mode` is ignored.
    /// - Growing: depends on `prealloc_mode`:
    ///   - `None`/`Zero`: the file length is simply set to `new_size`.
    ///   - `Allocate`: space is allocated via `fallocate()` (UNIX except macOS) or
    ///     `fcntl(F_PREALLOCATE)` plus a length change (macOS); returns `Unsupported` on
    ///     non-UNIX systems.
    ///   - `WriteData`: the new range is filled through `write_full_zeroes()`.
    ///
    /// The cached size is updated on success.
    async fn resize(&self, new_size: u64, prealloc_mode: PreallocateMode) -> io::Result<()> {
        // Hold the write lock while changing the file length
        let file = self.file.write().unwrap();
        let current_size = self.size.load(Ordering::Relaxed);

        match new_size.cmp(&current_size) {
            std::cmp::Ordering::Equal => return Ok(()),
            std::cmp::Ordering::Less => {
                file.set_len(new_size)?;
                self.size.fetch_min(new_size, Ordering::Relaxed);
                return Ok(());
            }
            std::cmp::Ordering::Greater => (), // handled below
        }

        // From here on, `new_size > current_size`, so `new_size - current_size` cannot underflow
        match prealloc_mode {
            PreallocateMode::None | PreallocateMode::Zero => file.set_len(new_size)?,
            PreallocateMode::Allocate => {
                #[cfg(not(unix))]
                return Err(io::ErrorKind::Unsupported.into());

                #[cfg(all(unix, not(target_os = "macos")))]
                {
                    let ofs = current_size.try_into().map_err(io::Error::other)?;
                    let len = (new_size - current_size)
                        .try_into()
                        .map_err(io::Error::other)?;
                    // Mode 0: allocate the range `[ofs, ofs + len)`
                    while_eintr(|| unsafe { libc::fallocate(file.as_raw_fd(), 0, ofs, len) })
                        .map_err(Self::map_os_err)?;
                }

                #[cfg(target_os = "macos")]
                {
                    // Best-effort.  PEOFPOSMODE allocates from the “physical” EOF, wherever that
                    // may be, but the only alternative would be VOLPOSMODE, which nobody knows the
                    // meaning of.  Also doesn’t change the file length, we need to truncate
                    // afterwards still.
                    let mut params = libc::fstore_t {
                        fst_flags: libc::F_ALLOCATEALL,
                        fst_posmode: libc::F_PEOFPOSMODE,
                        fst_offset: 0,
                        fst_length: (new_size - current_size)
                            .try_into()
                            .map_err(io::Error::other)?,
                        fst_bytesalloc: 0, // output
                    };
                    while_eintr(|| unsafe {
                        libc::fcntl(file.as_raw_fd(), libc::F_PREALLOCATE, &mut params)
                    })
                    .map_err(Self::map_os_err)?;

                    file.set_len(new_size)?;
                }
            }
            PreallocateMode::WriteData => {
                // FIXME: Keeping the lock would be nice, but resizing concurrently with I/O is
                // pretty risky anyway.
                // The lock must be released before the write below is awaited.
                drop(file);
                write_full_zeroes(self, current_size, new_size - current_size).await?;
            }
        }

        self.size.fetch_max(new_size, Ordering::Relaxed);
        Ok(())
    }
403}
404
405impl File {
    /// Central internal function to create a `File` object.
    ///
    /// Determines the file size, then the alignment requirements (starting from system-reported
    /// minimums when direct I/O is in use, and probing from there), and assembles the object.
    ///
    /// `filename` is stored as-is and may be `None` if unknown.
    ///
    /// `direct_io` should be `true` if direct I/O was requested, and can be `false` if that status
    /// is unknown.
    ///
    /// `relaxed_sync` (macOS only) is stored for use by `sync()`.
    ///
    /// Fails only if the file size cannot be determined.  Panics if the probed request or memory
    /// alignment is not a power of two.
    fn new(
        mut file: fs::File,
        filename: Option<PathBuf>,
        direct_io: bool,
        #[cfg(target_os = "macos")] relaxed_sync: bool,
    ) -> io::Result<Self> {
        let size = get_file_size(&file).err_context(|| "Failed to determine file size")?;

        // Even if the caller did not say so, the file may have been opened with `O_DIRECT`
        #[cfg(all(unix, not(target_os = "macos")))]
        let direct_io = direct_io || {
            // Safe: No argument, returns result.
            let res = unsafe { libc::fcntl(file.as_raw_fd(), libc::F_GETFL) };
            res > 0 && (res & libc::O_DIRECT) != 0
        };

        // Lower bounds from which to start probing
        let (min_req_align, min_mem_align) = if direct_io {
            #[cfg(unix)]
            {
                (
                    Self::get_min_dio_req_align(&file),
                    Self::get_min_dio_mem_align(&file),
                )
            }

            #[cfg(not(unix))]
            {
                (1, 1)
            } // probe it then
        } else {
            (1, 1)
        };

        let (req_align, mem_align, zero_align, discard_align) =
            Self::probe_alignments(&mut file, min_req_align, min_mem_align);
        assert!(req_align.is_power_of_two());
        assert!(mem_align.is_power_of_two());

        Ok(File {
            file: RwLock::new(file),
            filename,
            req_align,
            mem_align,
            zero_align,
            discard_align,
            size: size.into(),
            common_storage_helper: Default::default(),
            #[cfg(target_os = "macos")]
            relaxed_sync,
            discard_unsupported: AtomicBool::new(false),
        })
    }
461
    /// Probe minimal request, memory, zero and discard alignments.
    ///
    /// Start at `min_req_align` and `min_mem_align`.
    ///
    /// Returns `(req_align, mem_align, zero_align, discard_align)`.
    ///
    /// Request and memory alignments are found by trying accesses (see `probe_access()`) with
    /// growing alignments until one is accepted.  If probing fails, presumably safe fallback
    /// values are used instead; this function itself never fails.
    ///
    /// Zero and discard alignments are not probed: they are 1 everywhere but on macOS, where the
    /// block size reported by `fstatfs()` is used (or the page size if that call fails).
    ///
    /// Panics if an alignment to probe exceeds the size of the test buffer, i.e. the greater of
    /// the page size and 65536.
    #[cfg(unix)]
    fn probe_alignments(
        file: &mut fs::File,
        min_req_align: usize,
        min_mem_align: usize,
    ) -> (usize, usize, usize, usize) {
        let mut page_size = page_size::get();
        if !page_size.is_power_of_two() {
            let assume = page_size.checked_next_power_of_two().unwrap_or(4096);
            let assume = cmp::max(4096, assume);
            warn!("Reported page size of {page_size} is not a power of two, assuming {assume}");
            page_size = assume;
        }

        #[cfg(not(target_os = "macos"))]
        let (zero_align, discard_align) = (1, 1);
        #[cfg(target_os = "macos")]
        let (zero_align, discard_align) = {
            let mut statfs: libc::statfs = unsafe { std::mem::zeroed() };
            // Safe: FD is valid, passed pointer is valid and its type matches the call.
            match while_eintr(|| unsafe { libc::fstatfs(file.as_raw_fd(), &mut statfs) }) {
                Ok(_) => (statfs.f_bsize as usize, statfs.f_bsize as usize),
                Err(_) => (page_size, page_size),
            }
        };

        // Shared by all probing accesses; cleared by `probe_access()` once writing turns out to be
        // impossible
        let mut writable = true;

        let max_req_align = 65536;
        let max_mem_align = cmp::max(page_size, max_req_align);

        // Minimum fallbacks in case something goes wrong.
        let safe_req_align = 4096;
        let safe_mem_align = cmp::max(page_size, safe_req_align);

        // Buffer of `max_mem_align` bytes, requested with `max_mem_align` alignment, so that any
        // alignment to probe can be obtained by picking a range within it
        let mut test_buf = match IoBuffer::new(max_mem_align, max_mem_align) {
            Ok(buf) => buf,
            Err(err) => {
                warn!(
                    "Failed to allocate memory to probe request alignment ({err}), \
                    falling back to {safe_req_align}/{safe_mem_align}"
                );
                return (safe_req_align, safe_mem_align, zero_align, discard_align);
            }
        };

        // Probe the request alignment: access `req_align` bytes at offset `req_align`, from the
        // start of the test buffer
        let mut req_align: usize = min_req_align;
        let result = loop {
            assert!(req_align <= max_mem_align);
            match Self::probe_access(
                file,
                test_buf.as_mut_range(0..req_align).into_slice(),
                req_align.try_into().unwrap(),
                &mut writable,
            ) {
                Ok(true) => break Ok(req_align),
                Ok(false) => {
                    if req_align >= max_req_align {
                        break Err(io::Error::other(format!(
                            "Maximum I/O alignment ({max_req_align}) exceeded"
                        )));
                    }
                    // No reason to probe anything between 1 and 512
                    if req_align == min_req_align {
                        req_align = cmp::max(min_req_align << 1, 512);
                    } else {
                        req_align <<= 1;
                    }
                }
                Err(err) => break Err(err),
            }
        };

        let req_align = match result {
            Ok(align) => {
                debug!("Probed request alignment: {align}");
                align
            }
            Err(err) => {
                // Failed to determine request alignment, use a presumably safe value
                let align = cmp::max(req_align, safe_req_align);
                warn!(
                    "Failed to probe request alignment ({err}; {}), falling back to {align} bytes",
                    err.kind(),
                );
                align
            }
        };

        // Probe the memory alignment: access `mem_align` bytes at offset 0, using the last
        // `mem_align` bytes of the test buffer
        let mut mem_align: usize = min_mem_align;
        let result = loop {
            assert!(mem_align <= max_mem_align);
            let range = (max_mem_align - mem_align)..max_mem_align;
            match Self::probe_access(
                file,
                test_buf.as_mut_range(range).into_slice(),
                0,
                &mut writable,
            ) {
                Ok(true) => break Ok(mem_align),
                Ok(false) => {
                    // Not aligned
                    if mem_align >= max_mem_align {
                        break Err(io::Error::other(format!(
                            "Maximum memory alignment ({max_mem_align}) exceeded"
                        )));
                    }
                    // No reason to probe anything between 1 and the page size (or 4096 at least)
                    if mem_align == min_mem_align {
                        mem_align = cmp::max(min_mem_align << 1, cmp::min(page_size, 4096));
                    } else {
                        mem_align <<= 1;
                    }
                }
                Err(err) => break Err(err),
            }
        };

        let mem_align = match result {
            Ok(align) => {
                debug!("Probed memory alignment: {align}");
                align
            }
            Err(err) => {
                // Failed to determine memory alignment, use a presumably safe value
                let align = cmp::max(mem_align, safe_mem_align);
                warn!(
                    "Failed to probe memory alignment ({err}; {}), falling back to {align} bytes",
                    err.kind(),
                );
                align
            }
        };

        (req_align, mem_align, zero_align, discard_align)
    }
601
602    /// Do an alignment-probing I/O access.
603    ///
604    /// Return `Ok(true)` if everything was OK, and `Ok(false)` if the request was reported to be
605    /// misaligned.
606    ///
607    /// `may_write` is a boolean that controls whether this is allowed to write (the same data read
608    /// before) to improve reliability.  Is automatically set to `false` if writing is found to not
609    /// be possible.
610    #[cfg(unix)]
611    fn probe_access(
612        file: &mut fs::File,
613        slice: &mut [u8],
614        offset: libc::off_t,
615        may_write: &mut bool,
616    ) -> io::Result<bool> {
617        // Use `libc::pread` so we get well-defined errors.
618        // Safe: Passing the slice as the buffer it is.
619        let ret = while_eintr(|| unsafe {
620            libc::pread(
621                file.as_raw_fd(),
622                slice.as_mut_ptr() as *mut libc::c_void,
623                slice.len(),
624                offset,
625            )
626        });
627
628        if let Err(err) = ret {
629            if err.raw_os_error() == Some(libc::EINVAL) {
630                return Ok(false);
631            } else {
632                return Err(err);
633            }
634        }
635
636        if !*may_write {
637            return Ok(true);
638        }
639
640        // Safe: Passing the slice as the buffer it is.
641        let ret = while_eintr(|| unsafe {
642            libc::pwrite(
643                file.as_raw_fd(),
644                slice.as_ptr() as *const libc::c_void,
645                slice.len(),
646                offset,
647            )
648        });
649
650        if let Err(err) = ret {
651            if err.raw_os_error() == Some(libc::EINVAL) {
652                Ok(false)
653            } else if err.raw_os_error() == Some(libc::EBADF) {
654                *may_write = false;
655                Ok(true)
656            } else {
657                Err(err)
658            }
659        } else {
660            Ok(true)
661        }
662    }
663
    /// Get system-reported minimum request alignment for direct I/O.
    ///
    /// Asks the OS via a platform-specific ioctl (Linux, macOS, FreeBSD) and returns the reported
    /// value if the call succeeds and the value is a power of two.  In all other cases, returns 1
    /// so the caller starts probing from scratch.
    #[cfg(unix)]
    fn get_min_dio_req_align(file: &fs::File) -> usize {
        #[cfg(target_os = "linux")]
        {
            let mut alignment = 0;
            // NOTE(review): presumably wraps the `BLKSSZGET` ioctl -- confirm in the `ioctl`
            // module
            let res = unsafe { ioctl::blksszget(file.as_raw_fd(), &mut alignment) };
            if res.is_ok() && alignment > 0 {
                let alignment = alignment as usize;
                if alignment.is_power_of_two() {
                    return alignment;
                }
            }
        }

        #[cfg(target_os = "macos")]
        {
            let mut alignment = 0;
            // NOTE(review): presumably wraps the `DKIOCGETBLOCKSIZE` ioctl -- confirm in the
            // `ioctl` module
            let res = unsafe { ioctl::dkiocgetblocksize(file.as_raw_fd(), &mut alignment) };
            if res.is_ok() && alignment.is_power_of_two() {
                return alignment as usize;
            }
        }

        #[cfg(target_os = "freebsd")]
        {
            let mut alignment = 0;
            // NOTE(review): presumably wraps the `DIOCGSECTORSIZE` ioctl -- confirm in the
            // `ioctl` module
            let res = unsafe { ioctl::diocgsectorsize(file.as_raw_fd(), &mut alignment) };
            if res.is_ok() && alignment.is_power_of_two() {
                return alignment as usize;
            }
        }

        // Then we’ll probe.
        1
    }
700
    /// Get system-reported minimum memory alignment for direct I/O.
    ///
    /// Always returns 1, i.e. leaves it to the caller to probe from scratch; `_file` is unused.
    #[cfg(unix)]
    fn get_min_dio_mem_align(_file: &fs::File) -> usize {
        // I don’t think there’s a reliable way to get this.
        1
    }
707
708    /// Probe minimal request and memory alignments.
709    ///
710    /// Start at `min_req_align` and `min_mem_align`.
711    #[cfg(windows)]
712    fn probe_alignments(
713        _file: &mut fs::File,
714        min_req_align: usize,
715        min_mem_align: usize,
716    ) -> (usize, usize, usize, usize) {
717        // TODO: Need to find out how Windows indicates unaligned I/O
718        (
719            cmp::max(min_req_align, 4096),
720            cmp::max(min_mem_align, 4096),
721            1,
722            1,
723        )
724    }
725
726    /// Implementation for anything that opens a file.
727    fn do_open_sync(opts: StorageOpenOptions, base_fs_opts: fs::OpenOptions) -> io::Result<Self> {
728        let Some(filename) = opts.filename else {
729            return Err(io::Error::new(
730                io::ErrorKind::InvalidInput,
731                "Filename required",
732            ));
733        };
734
735        let mut file_opts = base_fs_opts;
736        file_opts.read(true).write(opts.writable);
737        #[cfg(not(target_os = "macos"))]
738        if opts.direct {
739            file_opts.custom_flags(
740                #[cfg(unix)]
741                libc::O_DIRECT,
742                #[cfg(windows)]
743                windows_sys::Win32::Storage::FileSystem::FILE_FLAG_NO_BUFFERING,
744            );
745        }
746
747        let filename_owned = filename.to_owned();
748        let file = file_opts.open(filename)?;
749
750        #[cfg(target_os = "macos")]
751        if opts.direct {
752            // Safe: We check the return value.
753            while_eintr(|| unsafe { libc::fcntl(file.as_raw_fd(), libc::F_NOCACHE, 1) })
754                .err_context(|| "Failed to disable host cache")?;
755        }
756
757        Self::new(
758            file,
759            Some(filename_owned),
760            opts.direct,
761            #[cfg(target_os = "macos")]
762            opts.relaxed_sync,
763        )
764    }
765
766    /// For special operations, ensure the error kind is usable.
767    ///
768    /// When invoking OS I/O operations directly, we turn the returned raw OS error code into an
769    /// `io::Error` object via `io::Error::last_os_error()`.  To differentiate between different
770    /// error cases, in generic imago code, we then don’t use that raw error code, but the error
771    /// kind (`io::ErrorKind`) instead, specifically it’s important to properly return an error of
772    /// kind `io::ErrorKind::Unsupported` when an operation is unsupported, so fall-backs can be
773    /// employed.
774    ///
775    /// Rust’s standard library only assigns this error kind (`Unsupported`) to `EOPNOTSUPP` (=
776    /// `ENOTSUP`) and `ENOSYS`.  However, some “special” operations (`fallocate()`,
777    /// `fcntl(F_PUNCHHOLE)`, `ioctl()`, ...) can return other error codes for when an operation is
778    /// not supported on a specific file, e.g. `ENODEV` or `ENXIO`.
779    ///
780    /// Assign the appropriate error kind to such errors so the generic code can handle them.
781    #[cfg(unix)]
782    fn map_os_err(err: io::Error) -> io::Error {
783        let Some(raw) = err.raw_os_error() else {
784            return err;
785        };
786
787        let has_kind = err.kind();
788        let want_kind = match raw {
789            #[allow(unreachable_patterns)] // `ENOTSUP` may be equal to `EOPNOTSUPP`
790            libc::ENOTSUP | libc::EOPNOTSUPP | libc::ENODEV | libc::ENXIO | libc::ENOTTY => {
791                io::ErrorKind::Unsupported
792            }
793            _ => has_kind,
794        };
795
796        if has_kind != want_kind {
797            io::Error::new(want_kind, err)
798        } else {
799            err
800        }
801    }
802
    /// For special operations, ensure the error kind is usable.
    ///
    /// For non-UNIX systems, this is an identity map: `err` is returned unchanged.
    #[cfg(not(unix))]
    fn map_os_err(err: io::Error) -> io::Error {
        err
    }
810
811    /// Attempt to discard range by truncating the file.
812    ///
813    /// If the given range is at the end of the file, discard it by simply truncating the file.
814    /// Return `true` on success.
815    ///
816    /// If the range is not at the end of the file, i.e. another method of discarding is needed,
817    /// return `false`.
818    fn try_discard_by_truncate(&self, offset: u64, length: u64) -> io::Result<bool> {
819        // Prevent modifications to the file length
820        #[allow(clippy::readonly_write_lock)]
821        let file = self.file.write().unwrap();
822
823        let size = self.size.load(Ordering::Relaxed);
824        if offset >= size {
825            // Nothing to do
826            return Ok(true);
827        }
828
829        // If `offset + length` overflows, we can just assume it ends at `size`.  (Anything past
830        // `size is irrelevant anyway.)
831        let end = offset.checked_add(length).unwrap_or(size);
832        if end < size {
833            return Ok(false);
834        }
835
836        file.set_len(offset)?;
837        Ok(true)
838    }
839
840    /// Ensure the given range reads back as zeroes, or return an error.
841    async fn discard_to_zero(&self, offset: u64, length: u64) -> io::Result<()> {
842        if self.try_discard_by_truncate(offset, length)? {
843            return Ok(());
844        }
845
846        if self.discard_unsupported.load(Ordering::Relaxed) {
847            Err(io::ErrorKind::Unsupported.into())
848        } else if let Err(err) = self.discard_to_zero_os_specific(offset, length).await {
849            if err.kind() == io::ErrorKind::Unsupported {
850                self.discard_unsupported.store(true, Ordering::Relaxed);
851            }
852            Err(err)
853        } else {
854            Ok(())
855        }
856    }
857
858    /// Via OS-specific means, ensure the given range reads back as zeroes, or return an error.
859    #[cfg(target_os = "linux")]
860    async fn discard_to_zero_os_specific(&self, offset: u64, length: u64) -> io::Result<()> {
861        let offset: libc::off_t = offset
862            .try_into()
863            .map_err(|e| io::Error::other(format!("Discard/write-zeroes offset error: {e}")))?;
864        let length: libc::off_t = length
865            .try_into()
866            .map_err(|e| io::Error::other(format!("Discard/write-zeroes length error: {e}")))?;
867
868        let file = self.file.read().unwrap();
869        // Safe: File descriptor is valid, and the rest are simple integer parameters.
870        while_eintr(|| unsafe {
871            libc::fallocate(
872                file.as_raw_fd(),
873                libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
874                offset,
875                length,
876            )
877        })
878        .map_err(Self::map_os_err)?;
879
880        Ok(())
881    }
882
883    /// Via OS-specific means, ensure the given range reads back as zeroes, or return an error.
884    #[cfg(windows)]
885    async fn discard_to_zero_os_specific(&self, offset: u64, length: u64) -> io::Result<()> {
886        let offset: i64 = offset
887            .try_into()
888            .map_err(|e| io::Error::other(format!("Discard/write-zeroes offset error: {e}")))?;
889        let length: i64 = length
890            .try_into()
891            .map_err(|e| io::Error::other(format!("Discard/write-zeroes length error: {e}")))?;
892
893        let end = offset.saturating_add(length).saturating_add(1);
894        let params = FILE_ZERO_DATA_INFORMATION {
895            FileOffset: offset,
896            BeyondFinalZero: end,
897        };
898        let mut _returned = 0;
899        let file = self.file.read().unwrap();
900        // Safe: File handle is valid, mandatory pointers (input, returned length) are passed and
901        // valid, the parameter type matches the call, and the input size matches the object
902        // passed.
903        let ret = unsafe {
904            DeviceIoControl(
905                file.as_raw_handle(),
906                FSCTL_SET_ZERO_DATA,
907                (&params as *const FILE_ZERO_DATA_INFORMATION).cast::<std::ffi::c_void>(),
908                size_of_val(&params) as u32,
909                std::ptr::null_mut(),
910                0,
911                &mut _returned,
912                std::ptr::null_mut(),
913            )
914        };
915        if ret == 0 {
916            return Err(Self::map_os_err(io::Error::last_os_error()));
917        }
918
919        Ok(())
920    }
921
922    /// Via OS-specific means, ensure the given range reads back as zeroes, or return an error.
923    #[cfg(target_os = "macos")]
924    async fn discard_to_zero_os_specific(&self, offset: u64, length: u64) -> io::Result<()> {
925        let offset: libc::off_t = offset
926            .try_into()
927            .map_err(|e| io::Error::other(format!("Discard/write-zeroes offset error: {e}")))?;
928        let length: libc::off_t = length
929            .try_into()
930            .map_err(|e| io::Error::other(format!("Discard/write-zeroes length error: {e}")))?;
931
932        let params = libc::fpunchhole_t {
933            fp_flags: 0,
934            reserved: 0,
935            fp_offset: offset,
936            fp_length: length,
937        };
938        let file = self.file.read().unwrap();
939        // Safe: FD is valid, passed pointer is valid and its type matches the call.
940        while_eintr(|| unsafe { libc::fcntl(file.as_raw_fd(), libc::F_PUNCHHOLE, &params) })
941            .map_err(Self::map_os_err)?;
942
943        Ok(())
944    }
945
946    /// Via OS-specific means, ensure the given range reads back as zeroes, or return an error.
947    #[cfg(not(any(target_os = "linux", target_os = "macos", windows)))]
948    async fn discard_to_zero_os_specific(&self, offset: u64, length: u64) -> io::Result<()> {
949        Err(io::ErrorKind::Unsupported.into())
950    }
951}
952
953impl Display for File {
954    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
955        if let Some(filename) = self.filename.as_ref() {
956            write!(f, "file:{filename:?}")
957        } else {
958            write!(f, "file:<unknown path>")
959        }
960    }
961}
962
963/// Get total size in bytes of the given file.
964///
965/// If the file is a block or character device, use get_device_size() instead of
966/// reading len from metadata which doesn't work on some platforms like macOS.
967fn get_file_size(file: &fs::File) -> io::Result<u64> {
968    #[allow(clippy::bind_instead_of_map)]
969    file.metadata().and_then(|m| {
970        #[cfg(unix)]
971        if m.file_type().is_block_device() || m.file_type().is_char_device() {
972            return get_device_size(file);
973        }
974        Ok(m.len())
975    })
976}
977
978cfg_if! {
979    if #[cfg(target_os = "linux")] {
980        /// Get total size in bytes of the given block or character device.
981        fn get_device_size(file: &fs::File) -> io::Result<u64> {
982            let mut size = 0;
983            unsafe { ioctl::blkgetsize64(file.as_raw_fd(), &mut size) }?;
984            Ok(size)
985        }
986    } else if #[cfg(target_os = "macos")] {
987        /// Get total size in bytes of the given block or character device.
988        fn get_device_size(file: &fs::File) -> io::Result<u64> {
989            let mut block_size = 0;
990            unsafe { ioctl::dkiocgetblocksize(file.as_raw_fd(), &mut block_size) }?;
991            let mut block_count = 0;
992            unsafe { ioctl::dkiocgetblockcount(file.as_raw_fd(), &mut block_count) }?;
993            Ok(u64::from(block_size) * block_count)
994        }
995    } else if #[cfg(target_os = "freebsd")] {
996        /// Get total size in bytes of the given block or character device.
997        fn get_device_size(file: &fs::File) -> io::Result<u64> {
998            let mut size = 0;
999            unsafe { ioctl::diocgmediasize(file.as_raw_fd(), &mut size) }?;
1000            Ok(size as u64)
1001        }
1002    } else if #[cfg(unix)] {
1003        /// Get total size in bytes of the given block or character device - unsupported platform.
1004        fn get_device_size(_file: &fs::File) -> io::Result<u64> {
1005            Err(io::ErrorKind::Unsupported.into())
1006        }
1007    }
1008}
1009
/// This module generates type-safe wrappers for chosen ioctls
///
/// Every wrapper is only compiled for the operating system that provides the respective ioctl.
mod ioctl {
    // `nix` macros generating the wrapper functions below; callers invoke the wrappers inside
    // `unsafe` blocks, passing a raw file descriptor and a pointer to the output value.
    #[cfg(unix)]
    use nix::ioctl_read;
    #[cfg(target_os = "linux")]
    use nix::ioctl_read_bad;

    // Linux:
    // https://github.com/torvalds/linux/blob/master/include/uapi/linux/fs.h#L200

    // Reads a `u64`; used by `get_device_size()` as the device size in bytes.
    #[cfg(target_os = "linux")]
    ioctl_read!(blkgetsize64, 0x12, 114, u64);

    // Reads a `c_int` using the request number `libc::BLKSSZGET` as-is.
    // NOTE(review): presumably the device's logical sector size -- confirm against the caller.
    #[cfg(target_os = "linux")]
    ioctl_read_bad!(blksszget, libc::BLKSSZGET, libc::c_int);

    // macOS:
    // https://github.com/apple-oss-distributions/xnu/blob/main/bsd/sys/disk.h#L198-L199

    // Reads a `u32`; used by `get_device_size()` as the block size in bytes.
    #[cfg(target_os = "macos")]
    ioctl_read!(dkiocgetblocksize, 'd', 24, u32);

    // Reads a `u64`; used by `get_device_size()` as the number of blocks.
    #[cfg(target_os = "macos")]
    ioctl_read!(dkiocgetblockcount, 'd', 25, u64);

    // FreeBSD:
    // https://web.mit.edu/freebsd/head/sys/sys/disk.h

    // Reads a `c_uint`.
    // NOTE(review): presumably the device's sector size in bytes -- confirm against the caller.
    #[cfg(target_os = "freebsd")]
    ioctl_read!(diocgsectorsize, 'd', 128, libc::c_uint);

    // Reads an `off_t`; used by `get_device_size()` as the device size in bytes.
    #[cfg(target_os = "freebsd")]
    ioctl_read!(diocgmediasize, 'd', 129, libc::off_t);
}