imago/qcow2/metadata.rs

//! Functionality for working with qcow2 metadata.

use super::types::*;
use crate::io_buffers::IoBuffer;
use crate::macros::numerical_enum;
use crate::misc_helpers::invalid_data;
use crate::{Storage, StorageExt};
use bincode::config::{BigEndian, Configuration as BincodeConfiguration, Fixint};
use bincode::{Decode, Encode};
use std::collections::HashMap;
use std::mem::size_of;
use std::num::TryFromIntError;
use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicU64, AtomicU8, Ordering};
use std::{cmp, io};
use tokio::sync::{Mutex, MutexGuard};
use tracing::error;

/// Qcow header magic ("QFI\xfb").
pub(super) const MAGIC: u32 = 0x51_46_49_fb;

/// Maximum file length.
const MAX_FILE_LENGTH: u64 = 0x0100_0000_0000_0000u64;

/// Maximum permissible host offset.
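/// (Table entries store host offsets in bits 9-55, so offsets must remain below
/// 2^56 = `MAX_FILE_LENGTH`; see the `L1Entry` and `L2Entry` layouts below.)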
pub(super) const MAX_OFFSET: HostOffset = HostOffset(MAX_FILE_LENGTH - 512);

/// Minimum cluster size.
///
/// Defined by the specification.
pub(super) const MIN_CLUSTER_SIZE: usize = 512;

/// Maximum cluster size.
///
/// This is QEMU’s limit, so we can apply it, too.
pub(super) const MAX_CLUSTER_SIZE: usize = 2 * 1024 * 1024;

/// Minimum number of bits per refcount entry.
pub(super) const MIN_REFCOUNT_WIDTH: usize = 1;

/// Maximum number of bits per refcount entry.
pub(super) const MAX_REFCOUNT_WIDTH: usize = 64;

/// Bincode configuration for the qcow2 integer format.
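///
/// A minimal sketch of the effect (illustrative, not a doctest):
///
/// ```ignore
/// // Big-endian fixed-width encoding: the magic 0x51_46_49_fb is encoded as the
/// // bytes "QFI\xfb", exactly as they appear on disk.
/// let bytes = bincode::encode_to_vec(MAGIC, BINCODE_CFG).unwrap();
/// assert_eq!(bytes, [0x51, 0x46, 0x49, 0xfb]);
/// ```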
const BINCODE_CFG: BincodeConfiguration<BigEndian, Fixint> = bincode::config::standard()
    .with_fixed_int_encoding()
    .with_big_endian();

/// Qcow2 v2 header.
#[derive(Decode, Encode)]
struct V2Header {
    /// Qcow magic string ("QFI\xfb").
    magic: u32,

    /// Version number (valid values are 2 and 3).
    version: u32,

    /// Offset into the image file at which the backing file name is stored (NB: The string is not
    /// null terminated).  0 if the image doesn’t have a backing file.
    ///
    /// Note: backing files are incompatible with raw external data files (auto-clear feature bit
    /// 1).
    backing_file_offset: u64,

    /// Length of the backing file name in bytes.  Must not be longer than 1023 bytes.  Undefined
    /// if the image doesn’t have a backing file.
    backing_file_size: u32,

    /// Number of bits that are used for addressing an offset within a cluster (`1 << cluster_bits`
    /// is the cluster size).  Must not be less than 9 (i.e. 512 byte clusters).
    ///
    /// Note: qemu as of today has an implementation limit of 2 MB as the maximum cluster size and
    /// won’t be able to open images with larger cluster sizes.
    ///
    /// Note: if the image has Extended L2 Entries then `cluster_bits` must be at least 14 (i.e.
    /// 16384 byte clusters).
    cluster_bits: u32,

    /// Virtual disk size in bytes.
    ///
    /// Note: qemu has an implementation limit of 32 MB as the maximum L1 table size.  With a 2 MB
    /// cluster size, it is unable to populate a virtual cluster beyond 2 EB (61 bits); with a 512
    /// byte cluster size, it is unable to populate a virtual size larger than 128 GB (37 bits).
    /// Meanwhile, L1/L2 table layouts limit an image to no more than 64 PB (56 bits) of populated
    /// clusters, and an image may hit other limits first (such as a file system’s maximum size).
    size: AtomicU64,

    /// Encryption method:
    ///
    /// 0. no encryption
    /// 1. AES encryption
    /// 2. LUKS encryption
    crypt_method: u32,

    /// Number of entries in the active L1 table.
    l1_size: AtomicU32,

    /// Offset into the image file at which the active L1 table starts.  Must be aligned to a
    /// cluster boundary.
    l1_table_offset: AtomicU64,

    /// Offset into the image file at which the refcount table starts.  Must be aligned to a
    /// cluster boundary.
    refcount_table_offset: AtomicU64,

    /// Number of clusters that the refcount table occupies.
    refcount_table_clusters: AtomicU32,

    /// Number of snapshots contained in the image.
    nb_snapshots: u32,

    /// Offset into the image file at which the snapshot table starts.  Must be aligned to a
    /// cluster boundary.
    snapshots_offset: u64,
}

impl V2Header {
    /// Raw v2 header length.
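    ///
    /// (Eight `u32` fields plus five `u64` fields: `8 * 4 + 5 * 8 == 72`.)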
    const RAW_SIZE: usize = 72;
}

/// Qcow2 v3 header.
#[derive(Decode, Encode)]
struct V3HeaderBase {
    /// Bitmask of incompatible features.  An implementation must fail to open an image if an
    /// unknown bit is set.
    ///
    /// 0. Dirty bit.  If this bit is set then refcounts may be inconsistent, make sure to scan
    ///    L1/L2 tables to repair refcounts before accessing the image.
    /// 1. Corrupt bit.  If this bit is set then any data structure may be corrupt and the image
    ///    must not be written to (unless for regaining consistency).
    /// 2. External data file bit.  If this bit is set, an external data file is used.  Guest
    ///    clusters are then stored in the external data file.  For such images, clusters in the
    ///    external data file are not refcounted.  The offset field in the Standard Cluster
    ///    Descriptor must match the guest offset and neither compressed clusters nor internal
    ///    snapshots are supported.  An External Data File Name header extension may be present if
    ///    this bit is set.
    /// 3. Compression type bit.  If this bit is set, a non-default compression is used for
    ///    compressed clusters.  The compression_type field must be present and not zero.
    /// 4. Extended L2 Entries.  If this bit is set then L2 table entries use an extended format
    ///    that allows subcluster-based allocation.  See the Extended L2 Entries section for more
    ///    details.
    ///
    /// Bits 5-63 are reserved (set to 0).
    incompatible_features: u64,

    /// Bitmask of compatible features.  An implementation can safely ignore any unknown bits that
    /// are set.
    ///
    /// 0. Lazy refcounts bit.  If this bit is set then lazy refcount updates can be used.  This
    ///    means marking the image file dirty and postponing refcount metadata updates.
    ///
    /// Bits 1-63 are reserved (set to 0).
    compatible_features: u64,

    /// Bitmask of auto-clear features.  An implementation may only write to an image with unknown
    /// auto-clear features if it clears the respective bits from this field first.
    ///
    /// 0. Bitmaps extension bit.  This bit indicates consistency for the bitmaps extension data.
    ///    It is an error if this bit is set without the bitmaps extension present.  If the bitmaps
    ///    extension is present but this bit is unset, the bitmaps extension data must be
    ///    considered inconsistent.
    /// 1. Raw external data bit.  If this bit is set, the external data file can be read as a
    ///    consistent standalone raw image without looking at the qcow2 metadata.  Setting this bit
    ///    has a performance impact for some operations on the image (e.g. writing zeros requires
    ///    writing to the data file instead of only setting the zero flag in the L2 table entry)
    ///    and conflicts with backing files.  This bit may only be set if the External Data File
    ///    bit (incompatible feature bit 1) is also set.
    ///
    /// Bits 2-63 are reserved (set to 0).
    autoclear_features: u64,

    /// Describes the width of a reference count block entry (width in bits: `refcount_bits = 1 <<
    /// refcount_order`).  For version 2 images, the order is always assumed to be 4 (i.e.
    /// `refcount_bits = 16`).  This value may not exceed 6 (i.e. `refcount_bits = 64`).
    refcount_order: u32,

    /// Length of the header structure in bytes.  For version 2 images, the length is always
    /// assumed to be 72 bytes.  For version 3 it’s at least 104 bytes and must be a multiple of 8.
    header_length: u32,
}

impl V3HeaderBase {
    /// Raw v3 header length beyond the v2 header.
    const RAW_SIZE: usize = 104 - V2Header::RAW_SIZE;
}

impl Default for V3HeaderBase {
    fn default() -> Self {
        V3HeaderBase {
            incompatible_features: 0,
            compatible_features: 0,
            autoclear_features: 0,
            refcount_order: 4,
            header_length: (V2Header::RAW_SIZE + V3HeaderBase::RAW_SIZE) as u32,
        }
    }
}

numerical_enum! {
    /// Incompatible feature bits.
    pub(super) enum IncompatibleFeatures as u64 {
        Dirty = 1 << 0,
        Corrupt = 1 << 1,
        ExternalDataFile = 1 << 2,
        CompressionType = 1 << 3,
        ExtendedL2Entries = 1 << 4,
    }
}

impl From<IncompatibleFeatures> for (FeatureType, u8) {
    /// Get this feature’s feature name table key.
    fn from(feat: IncompatibleFeatures) -> (FeatureType, u8) {
        assert!((feat as u64).is_power_of_two());
        (
            FeatureType::Incompatible,
            (feat as u64).trailing_zeros() as u8,
        )
    }
}
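// For example, `IncompatibleFeatures::ExtendedL2Entries` (bit value `1 << 4`) maps to
// the key `(FeatureType::Incompatible, 4)`, the format used by the feature name table
// extension.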

numerical_enum! {
    /// Compatible feature bits.
    pub(super) enum CompatibleFeatures as u64 {
        LazyRefcounts = 1 << 0,
    }
}

impl From<CompatibleFeatures> for (FeatureType, u8) {
    /// Get this feature’s feature name table key.
    fn from(feat: CompatibleFeatures) -> (FeatureType, u8) {
        assert!((feat as u64).is_power_of_two());
        (
            FeatureType::Compatible,
            (feat as u64).trailing_zeros() as u8,
        )
    }
}

numerical_enum! {
    /// Autoclear feature bits.
    pub(super) enum AutoclearFeatures as u64 {
        Bitmaps = 1 << 0,
        RawExternalData = 1 << 1,
    }
}

impl From<AutoclearFeatures> for (FeatureType, u8) {
    /// Get this feature’s feature name table key.
    fn from(feat: AutoclearFeatures) -> (FeatureType, u8) {
        assert!((feat as u64).is_power_of_two());
        (FeatureType::Autoclear, (feat as u64).trailing_zeros() as u8)
    }
}

numerical_enum! {
    /// Extension type IDs.
    pub(super) enum HeaderExtensionType as u32 {
        /// End of extension list.
        End = 0,

        /// Backing file format string.
        BackingFileFormat = 0xe2792aca,

        /// Map of feature bits to human-readable names.
        FeatureNameTable = 0x6803f857,

        /// External data file filename string.
        ExternalDataFileName = 0x44415441,
    }
}

/// Header for a header extension.
#[derive(Default, Decode, Encode)]
struct HeaderExtensionHeader {
    /// Type code of the header extension.
    extension_type: u32,

    /// Data length.
    length: u32,
}

impl HeaderExtensionHeader {
    /// Raw struct length.
    const RAW_SIZE: usize = 8;
}

numerical_enum! {
    /// Feature type ID for the feature name table.
    #[derive(Hash)]
    pub(super) enum FeatureType as u8 {
        Incompatible = 0,
        Compatible = 1,
        Autoclear = 2,
    }
}

/// Header extensions (high-level representation).
#[derive(Debug, Clone, Eq, PartialEq)]
pub(super) enum HeaderExtension {
    /// Backing file format string.
    BackingFileFormat(String),

    /// Map of feature bits to human-readable names.
    FeatureNameTable(HashMap<(FeatureType, u8), String>),

    /// External data file filename string.
    ExternalDataFileName(String),

    /// Unknown extension.
    Unknown {
        /// Type.
        extension_type: u32,
        /// Data (as read).
        data: Vec<u8>,
    },
}

/// Integrated header representation.
pub(super) struct Header {
    /// v2 part of the header.
    v2: V2Header,

    /// Base v3 part of the header.
    v3: V3HeaderBase,

    /// Unrecognized header fields.
    unknown_header_fields: Vec<u8>,

    /// Backing filename string.
    backing_filename: Option<String>,

    /// Extensions.
    extensions: Vec<HeaderExtension>,

    /// Whether an external data file is required.
    external_data_file: bool,
}

impl Header {
    /// Load the qcow2 header from disk.
    ///
    /// If `writable` is false, do not perform any modifications (e.g. clearing auto-clear bits).
    pub async fn load<S: Storage>(image: &S, writable: bool) -> io::Result<Self> {
        // TODO: More sanity checks.
        let mut header_buf = vec![0u8; V2Header::RAW_SIZE];
        image.read(header_buf.as_mut_slice(), 0).await?;

        let header: V2Header = decode_binary(&header_buf)?;
        if header.magic != MAGIC {
            return Err(invalid_data("Not a qcow2 file"));
        }

        let v3header_base = if header.version == 2 {
            V3HeaderBase::default()
        } else if header.version == 3 {
            let mut header_buf = vec![0u8; V3HeaderBase::RAW_SIZE];
            image
                .read(header_buf.as_mut_slice(), V2Header::RAW_SIZE as u64)
                .await?;
            decode_binary(&header_buf)?
        } else {
            return Err(invalid_data(format!(
                "qcow2 v{} is not supported",
                header.version
            )));
        };

        let cluster_size = 1usize.checked_shl(header.cluster_bits).ok_or_else(|| {
            invalid_data(format!("Invalid cluster size: 2^{}", header.cluster_bits))
        })?;
        if !(MIN_CLUSTER_SIZE..=MAX_CLUSTER_SIZE).contains(&cluster_size) {
            return Err(invalid_data(format!(
                "Invalid cluster size: {cluster_size}; must be between {MIN_CLUSTER_SIZE} and {MAX_CLUSTER_SIZE}",
            )));
        }

        let min_header_size = V2Header::RAW_SIZE + V3HeaderBase::RAW_SIZE;
        if (v3header_base.header_length as usize) < min_header_size {
            return Err(invalid_data(format!(
                "qcow2 header too short: {} < {min_header_size}",
                v3header_base.header_length,
            )));
        } else if (v3header_base.header_length as usize) > cluster_size {
            return Err(invalid_data(format!(
                "qcow2 header too big: {} > {cluster_size}",
                v3header_base.header_length,
            )));
        }

        let unknown_header_fields = if header.version == 2 {
            Vec::new()
        } else {
            let mut unknown_header_fields =
                vec![0u8; v3header_base.header_length as usize - min_header_size];
            image
                .read(&mut unknown_header_fields, min_header_size as u64)
                .await?;
            unknown_header_fields
        };

        let l1_offset = HostOffset(header.l1_table_offset.load(Ordering::Relaxed));
        l1_offset
            .checked_cluster(header.cluster_bits)
            .ok_or_else(|| invalid_data(format!("Unaligned L1 table: {l1_offset}")))?;

        let rt_offset = HostOffset(header.refcount_table_offset.load(Ordering::Relaxed));
        rt_offset
            .checked_cluster(header.cluster_bits)
            .ok_or_else(|| invalid_data(format!("Unaligned refcount table: {rt_offset}")))?;

        let rc_width = 1usize
            .checked_shl(v3header_base.refcount_order)
            .ok_or_else(|| {
                invalid_data(format!(
                    "Invalid refcount width: 2^{}",
                    v3header_base.refcount_order
                ))
            })?;
        if !(MIN_REFCOUNT_WIDTH..=MAX_REFCOUNT_WIDTH).contains(&rc_width) {
            return Err(invalid_data(format!(
                "Invalid refcount width: {rc_width}; must be between {MIN_REFCOUNT_WIDTH} and {MAX_REFCOUNT_WIDTH}",
            )));
        }

        let backing_filename = if header.backing_file_offset != 0 {
            let (offset, length) = (header.backing_file_offset, header.backing_file_size);
            if length > 1023 {
                return Err(invalid_data(format!(
                    "Backing file name is too long ({length}, must not exceed 1023)"
                )));
            }

            let end = offset.checked_add(length as u64).ok_or(invalid_data(
                "Backing file name offset is invalid (too high)",
            ))?;
            if end >= cluster_size as u64 {
                return Err(invalid_data(
                    "Backing file name offset is invalid (beyond first cluster)",
                ));
            }

            let mut backing_buf = vec![0; length as usize];
            image.read(&mut backing_buf, offset).await?;

            Some(
                String::from_utf8(backing_buf)
                    .map_err(|err| invalid_data(format!("Backing file name is invalid: {err}")))?,
            )
        } else {
            None
        };

        let extensions = if header.version == 2 {
            Vec::new()
        } else {
            let mut ext_offset: u64 = v3header_base.header_length as u64;
            let mut extensions = Vec::<HeaderExtension>::new();
            loop {
                if ext_offset + HeaderExtensionHeader::RAW_SIZE as u64 > cluster_size as u64 {
                    return Err(invalid_data("Header extensions exceed the first cluster"));
                }

                let mut ext_hdr_buf = vec![0; HeaderExtensionHeader::RAW_SIZE];
                image.read(&mut ext_hdr_buf, ext_offset).await?;

                ext_offset += HeaderExtensionHeader::RAW_SIZE as u64;

                let ext_hdr: HeaderExtensionHeader = decode_binary(&ext_hdr_buf)?;
                let ext_end = ext_offset
                    .checked_add(ext_hdr.length as u64)
                    .ok_or_else(|| invalid_data("Header size overflow"))?;
                if ext_end > cluster_size as u64 {
                    return Err(invalid_data("Header extensions exceed the first cluster"));
                }

                let mut ext_data = vec![0; ext_hdr.length as usize];
                image.read(&mut ext_data, ext_offset).await?;

                ext_offset += (ext_hdr.length as u64).next_multiple_of(8);

                let Some(extension) =
                    HeaderExtension::deserialize(ext_hdr.extension_type, ext_data)?
                else {
                    break;
                };

                extensions.push(extension);
            }
            extensions
        };

        // Check for header extension conflicts
        let backing_fmt = extensions
            .iter()
            .find(|ext| matches!(ext, HeaderExtension::BackingFileFormat(_)));
        if let Some(backing_fmt) = backing_fmt {
            let conflicting = extensions.iter().find(|ext| {
                matches!(ext, HeaderExtension::BackingFileFormat(_)) && ext != &backing_fmt
            });
            if let Some(conflicting) = conflicting {
                return Err(io::Error::other(format!(
                    "Found conflicting backing file formats: {backing_fmt:?} != {conflicting:?}",
                )));
            }
        }
        let ext_data_file = extensions
            .iter()
            .find(|ext| matches!(ext, HeaderExtension::ExternalDataFileName(_)));
        if let Some(ext_data_file) = ext_data_file {
            let conflicting = extensions.iter().find(|ext| {
                matches!(ext, HeaderExtension::ExternalDataFileName(_)) && ext != &ext_data_file
            });
            if let Some(conflicting) = conflicting {
                return Err(io::Error::other(format!(
                    "Found conflicting external data file names: {ext_data_file:?} != {conflicting:?}",
                )));
            }
        }

        let mut incompatible_features = v3header_base.incompatible_features;
        let autoclear_features = v3header_base.autoclear_features;

        let external_data_file =
            incompatible_features & IncompatibleFeatures::ExternalDataFile as u64 != 0;
        incompatible_features &= !(IncompatibleFeatures::ExternalDataFile as u64);

        let mut header = Header {
            v2: header,
            v3: v3header_base,
            unknown_header_fields,
            backing_filename,
            extensions,
            external_data_file,
        };

        // No need to clear autoclear features for read-only images
        if autoclear_features != 0 && writable {
            header.v3.autoclear_features = 0;
            header.write(image).await?;
        }

        if incompatible_features != 0 {
            let feats = (0..64)
                .filter(|bit| incompatible_features & (1u64 << bit) != 0)
                .map(|bit| {
                    if let Some(name) = header.feature_name(FeatureType::Incompatible, bit) {
                        format!("{bit} ({name})")
                    } else {
                        format!("{bit}")
                    }
                })
                .collect::<Vec<String>>();

            return Err(invalid_data(format!(
                "Unrecognized incompatible feature(s) {}",
                feats.join(", ")
            )));
        }

        Ok(header)
    }

    /// Write the qcow2 header to disk.
    pub async fn write<S: Storage>(&mut self, image: &S) -> io::Result<()> {
        let header_len = if self.v2.version > 2 {
            let len = encoded_size(&self.v2).unwrap()
                + encoded_size(&self.v3).unwrap()
                + self.unknown_header_fields.len();
            let len = len.next_multiple_of(8);
            self.v3.header_length = len as u32;
            len
        } else {
            V2Header::RAW_SIZE
        };

        // If the header gets too long, try to remove the feature name table to make it small
        // enough
        let mut header_exts;
        let mut backing_file_ofs;
        loop {
            header_exts = self.serialize_extensions()?;

            backing_file_ofs = header_len
                .checked_add(header_exts.len())
                .ok_or_else(|| invalid_data("Header size overflow"))?;
            let backing_file_len = self
                .backing_filename
                .as_ref()
                .map(|n| n.len()) // length in bytes
                .unwrap_or(0);
            let header_end = backing_file_ofs
                .checked_add(backing_file_len)
                .ok_or_else(|| invalid_data("Header size overflow"))?;

            if header_end <= self.cluster_size() {
                break;
            }

            if !self
                .extensions
                .iter()
                .any(|e| e.extension_type() == HeaderExtensionType::FeatureNameTable as u32)
            {
                return Err(io::Error::other(format!(
                    "Header would be too long ({header_end} > {})",
                    self.cluster_size()
                )));
            }
            self.extensions
                .retain(|e| e.extension_type() != HeaderExtensionType::FeatureNameTable as u32);
        }

        if let Some(backing) = self.backing_filename.as_ref() {
            self.v2.backing_file_offset = backing_file_ofs as u64;
            self.v2.backing_file_size = backing.len() as u32; // length in bytes
        } else {
            self.v2.backing_file_offset = 0;
            self.v2.backing_file_size = 0;
        };

        let mut full_buf = encode_binary(&self.v2)?;
        if self.v2.version > 2 {
            full_buf.append(&mut encode_binary(&self.v3)?);
            full_buf.extend_from_slice(&self.unknown_header_fields);
            full_buf.resize(full_buf.len().next_multiple_of(8), 0);
        }

        full_buf.append(&mut header_exts);

        if let Some(backing) = self.backing_filename.as_ref() {
            full_buf.extend_from_slice(backing.as_bytes());
        }

        if full_buf.len() > self.cluster_size() {
            return Err(io::Error::other(format!(
                "Header is too big to write ({}, larger than a cluster ({}))",
                full_buf.len(),
                self.cluster_size(),
            )));
        }

        image.write(&full_buf, 0).await
    }

    /// Create a header for a new image.
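    ///
    /// Illustrative usage (crate-internal, so this is a sketch rather than a doctest):
    ///
    /// ```ignore
    /// // 64 KiB clusters (cluster_bits = 16), 16-bit refcounts (refcount_order = 4),
    /// // and a qcow2-formatted backing file.  Offsets and lengths left at 0 are
    /// // filled in by `Self::write()`.
    /// let header = Header::new(16, 4, Some("base.qcow2".into()), Some("qcow2".into()), None);
    /// ```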
    pub fn new(
        cluster_bits: u32,
        refcount_order: u32,
        backing_filename: Option<String>,
        backing_format: Option<String>,
        external_data_file: Option<String>,
    ) -> Self {
        assert!((MIN_CLUSTER_SIZE..=MAX_CLUSTER_SIZE)
            .contains(&1usize.checked_shl(cluster_bits).unwrap()));
        assert!((MIN_REFCOUNT_WIDTH..=MAX_REFCOUNT_WIDTH)
            .contains(&1usize.checked_shl(refcount_order).unwrap()));

        let has_external_data_file = external_data_file.is_some();
        let incompatible_features = if has_external_data_file {
            IncompatibleFeatures::ExternalDataFile as u64
        } else {
            0
        };

        let mut extensions = vec![HeaderExtension::feature_name_table()];
        if let Some(backing_format) = backing_format {
            extensions.push(HeaderExtension::BackingFileFormat(backing_format));
        }
        if let Some(external_data_file) = external_data_file {
            extensions.push(HeaderExtension::ExternalDataFileName(external_data_file));
        }

        Header {
            v2: V2Header {
                magic: MAGIC,
                version: 3,
                backing_file_offset: 0, // will be set by `Self::write()`
                backing_file_size: 0,   // will be set by `Self::write()`
                cluster_bits,
                size: 0.into(),
                crypt_method: 0,
                l1_size: 0.into(),
                l1_table_offset: 0.into(),
                refcount_table_offset: 0.into(),
                refcount_table_clusters: 0.into(),
                nb_snapshots: 0,
                snapshots_offset: 0,
            },
            v3: V3HeaderBase {
                incompatible_features,
                compatible_features: 0,
                autoclear_features: 0,
                refcount_order,
                header_length: 0, // will be set by `Self::write()`
            },
            unknown_header_fields: Vec::new(),
            backing_filename,
            extensions,
            external_data_file: has_external_data_file,
        }
    }

    /// Update from a newly loaded header.
    ///
    /// Checks whether fields we consider immutable have remained the same, and updates mutable
    /// fields.
    pub fn update(&self, new_header: &Header) -> io::Result<()> {
        /// Verify that the given field matches in `self` and `new_header`.
        macro_rules! check_field {
            ($($field:ident).*) => {
                (self.$($field).* == new_header.$($field).*).then_some(()).ok_or_else(|| {
                    io::Error::other(format!(
                        "Incompatible header modification on {}: {} != {}",
                        stringify!($($field).*),
                        self.$($field).*,
                        new_header.$($field).*
                    ))
                })
            };
        }

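        // `check_field!(v2.magic)`, for instance, expands to a comparison of
        // `self.v2.magic` against `new_header.v2.magic`, producing an error that
        // names the field if the two differ.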
        check_field!(v2.magic)?;
        check_field!(v2.version)?;
        check_field!(v2.backing_file_offset)?; // TODO: Should be mutable
        check_field!(v2.backing_file_size)?; // TODO: Should be mutable
        check_field!(v2.cluster_bits)?;
        // Size is mutable
        // L1 position is mutable
        // Reftable position is mutable
        check_field!(v2.crypt_method)?;
        check_field!(v2.nb_snapshots)?; // TODO: Should be mutable
        check_field!(v2.snapshots_offset)?; // TODO: Should be mutable
        check_field!(v3.incompatible_features)?; // TODO: Should be mutable
        check_field!(v3.compatible_features)?; // TODO: Should be mutable
        check_field!(v3.autoclear_features)?; // TODO: Should be mutable
        check_field!(v3.refcount_order)?;
        // header length is OK to ignore (as long as it’s valid)

        // TODO: Should be mutable
        (self.unknown_header_fields == new_header.unknown_header_fields)
            .then_some(())
            .ok_or_else(|| io::Error::other("Unknown header fields modified"))?;
        // TODO: Should be mutable
        (self.backing_filename == new_header.backing_filename)
            .then_some(())
            .ok_or_else(|| io::Error::other("Backing filename modified"))?;
        // TODO: Should be mutable
        (self.extensions == new_header.extensions)
            .then_some(())
            .ok_or_else(|| io::Error::other("Header extensions modified"))?;

        check_field!(external_data_file)?;

        self.v2.size.store(
            new_header.v2.size.load(Ordering::Relaxed),
            Ordering::Relaxed,
        );

        self.v2.l1_table_offset.store(
            new_header.v2.l1_table_offset.load(Ordering::Relaxed),
            Ordering::Relaxed,
        );
        self.v2.l1_size.store(
            new_header.v2.l1_size.load(Ordering::Relaxed),
            Ordering::Relaxed,
        );
        self.v2.refcount_table_offset.store(
            new_header.v2.refcount_table_offset.load(Ordering::Relaxed),
            Ordering::Relaxed,
        );
        self.v2.refcount_table_clusters.store(
            new_header
                .v2
                .refcount_table_clusters
                .load(Ordering::Relaxed),
            Ordering::Relaxed,
        );

        Ok(())
    }

    /// Guest disk size.
    pub fn size(&self) -> u64 {
        self.v2.size.load(Ordering::Relaxed)
    }

    /// Require a minimum qcow2 version.
    ///
    /// Return an error if the version requirement is not met.
    pub fn require_version(&self, minimum: u32) -> io::Result<()> {
        let version = self.v2.version;
        if version >= minimum {
            Ok(())
        } else {
            Err(io::Error::new(
                io::ErrorKind::Unsupported,
                format!("qcow2 version {minimum} required, image has version {version}"),
            ))
        }
    }

    /// Set the guest disk size.
    pub fn set_size(&self, new_size: u64) {
        self.v2.size.store(new_size, Ordering::Relaxed)
    }

    /// log2 of the cluster size.
    pub fn cluster_bits(&self) -> u32 {
        self.v2.cluster_bits
    }

    /// Cluster size in bytes.
    pub fn cluster_size(&self) -> usize {
        1 << self.cluster_bits()
    }

    /// Number of entries per L2 table.
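    ///
    /// (Each entry is a `u64`, so e.g. a 64 KiB cluster holds 8192 entries.)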
    pub fn l2_entries(&self) -> usize {
        // 3 == log2(size_of::<u64>())
        1 << (self.cluster_bits() - 3)
    }

    /// log2 of the number of entries per refcount block.
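    ///
    /// Illustrative example: with 64 KiB clusters (`cluster_bits == 16`) and the default
    /// `refcount_order == 4` (16-bit refcounts), this is `16 + 3 - 4 == 15`, i.e. 32768
    /// refcounts per refcount block.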
    pub fn rb_bits(&self) -> u32 {
        // log2(cluster_size / (refcount_bits / 8 bits per byte))
        // = log2(cluster_size * 8 / refcount_bits)
        // = log2(cluster_size) + log2(8) - log2(refcount_bits)
        self.cluster_bits() + 3 - self.refcount_order()
    }

    /// Number of entries per refcount block.
    pub fn rb_entries(&self) -> usize {
        1 << self.rb_bits()
    }

    /// log2 of the refcount bits.
    pub fn refcount_order(&self) -> u32 {
        self.v3.refcount_order
    }

    /// Offset of the L1 table.
    pub fn l1_table_offset(&self) -> HostOffset {
        HostOffset(self.v2.l1_table_offset.load(Ordering::Relaxed))
    }

    /// Number of entries in the L1 table.
    pub fn l1_table_entries(&self) -> usize {
        self.v2.l1_size.load(Ordering::Relaxed) as usize
    }

    /// Enter a new L1 table in the image header.
    pub fn set_l1_table(&self, l1_table: &L1Table) -> io::Result<()> {
        let offset = l1_table.get_offset().ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "New L1 table has no assigned offset",
            )
        })?;

        let entries = l1_table.entries();
        let entries = entries
            .try_into()
            .map_err(|err| invalid_data(format!("Too many L1 entries ({entries}): {err}")))?;

        self.v2.l1_table_offset.store(offset.0, Ordering::Relaxed);

        self.v2.l1_size.store(entries, Ordering::Relaxed);

        Ok(())
    }

    /// Offset of the refcount table.
    pub fn reftable_offset(&self) -> HostOffset {
        HostOffset(self.v2.refcount_table_offset.load(Ordering::Relaxed))
    }

    /// Number of clusters occupied by the refcount table.
    pub fn reftable_clusters(&self) -> ClusterCount {
        ClusterCount(self.v2.refcount_table_clusters.load(Ordering::Relaxed) as u64)
    }

    /// Number of entries in the refcount table.
    pub fn reftable_entries(&self) -> usize {
        // 3 == log2(size_of::<u64>())
        (self.reftable_clusters().byte_size(self.cluster_bits()) >> 3) as usize
    }

    /// Enter a new refcount table in the image header.
    pub fn set_reftable(&self, reftable: &RefTable) -> io::Result<()> {
        let offset = reftable.get_offset().ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "New refcount table has no assigned offset",
            )
        })?;

        let clusters = reftable.cluster_count();
        let clusters = clusters.0.try_into().map_err(|err| {
            invalid_data(format!("Too many reftable clusters ({clusters}): {err}"))
        })?;

        self.v2
            .refcount_table_clusters
            .store(clusters, Ordering::Relaxed);

        self.v2
            .refcount_table_offset
            .store(offset.0, Ordering::Relaxed);

        Ok(())
    }

    /// Backing filename from the image header (if any).
    pub fn backing_filename(&self) -> Option<&String> {
        self.backing_filename.as_ref()
    }

    /// Backing format string from the image header (if any).
    pub fn backing_format(&self) -> Option<&String> {
        self.extensions.iter().find_map(|e| match e {
            HeaderExtension::BackingFileFormat(fmt) => Some(fmt),
            _ => None,
        })
    }

    /// Whether this image requires an external data file.
    pub fn external_data_file(&self) -> bool {
        self.external_data_file
    }

    /// External data file filename from the image header (if any).
    pub fn external_data_filename(&self) -> Option<&String> {
        self.extensions.iter().find_map(|e| match e {
            HeaderExtension::ExternalDataFileName(filename) => Some(filename),
            _ => None,
        })
    }

    /// Translate a feature bit to a human-readable name.
    ///
    /// Uses the feature name table from the image header, if present.
    pub fn feature_name(&self, feat_type: FeatureType, bit: u32) -> Option<&String> {
        for e in &self.extensions {
            if let HeaderExtension::FeatureNameTable(names) = e {
                if let Some(name) = names.get(&(feat_type, bit as u8)) {
                    return Some(name);
                }
            }
        }

        None
    }

    /// Serialize all header extensions.
    fn serialize_extensions(&self) -> io::Result<Vec<u8>> {
        let mut result = Vec::new();
        for e in &self.extensions {
            let mut data = e.serialize_data()?;
            let ext_hdr = HeaderExtensionHeader {
                extension_type: e.extension_type(),
                length: data.len().try_into().map_err(|err| {
                    invalid_data(format!("Header extension too long ({}): {err}", data.len()))
                })?,
            };
            result.append(&mut encode_binary(&ext_hdr)?);
            result.append(&mut data);
            result.resize(result.len().next_multiple_of(8), 0);
        }

        let end_ext = HeaderExtensionHeader {
            extension_type: HeaderExtensionType::End as u32,
            length: 0,
        };
        result.append(&mut encode_binary(&end_ext)?);
        result.resize(result.len().next_multiple_of(8), 0);

        Ok(result)
    }
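
    // The serialized layout for a single `BackingFileFormat("qcow2")` extension is,
    // illustratively: 8 header bytes (type 0xe2792aca, length 5), the 5 data bytes,
    // 3 bytes of zero padding up to the next multiple of 8, and finally the 8-byte
    // End extension header (type 0, length 0).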

    /// Helper for functions that only need to update a few fields in the v2 part of the header.
    async fn write_v2_header<S: Storage>(&self, image: &S) -> io::Result<()> {
        let v2_header = encode_binary(&self.v2)?;
        image.write(&v2_header, 0).await
    }

    /// Write the refcount table pointer (offset and size) to disk.
    pub async fn write_reftable_pointer<S: Storage>(&self, image: &S) -> io::Result<()> {
        // TODO: Just write the reftable offset and size
        self.write_v2_header(image).await
    }

    /// Write the L1 table pointer (offset and size) to disk.
    pub async fn write_l1_table_pointer<S: Storage>(&self, image: &S) -> io::Result<()> {
        // TODO: Just write the L1 table offset and size
        self.write_v2_header(image).await
    }

    /// Write the guest disk size to disk.
    pub async fn write_size<S: Storage>(&self, image: &S) -> io::Result<()> {
        // TODO: Just write the size
        self.write_v2_header(image).await
    }
}

impl HeaderExtension {
    /// Parse an extension from its type and data.  Unrecognized types are stored as `Unknown`
    /// extensions; encountering the end-of-extensions marker returns `Ok(None)`.
    fn deserialize(ext_type: u32, data: Vec<u8>) -> io::Result<Option<Self>> {
        let ext = if let Ok(ext_type) = HeaderExtensionType::try_from(ext_type) {
            match ext_type {
                HeaderExtensionType::End => return Ok(None),
                HeaderExtensionType::BackingFileFormat => {
                    let fmt = String::from_utf8(data).map_err(|err| {
                        invalid_data(format!("Invalid backing file format: {err}"))
                    })?;
                    HeaderExtension::BackingFileFormat(fmt)
                }
                HeaderExtensionType::FeatureNameTable => {
                    let mut feats = HashMap::new();
                    for feat in data.chunks(48) {
                        // Each entry is 48 bytes (type, bit number, 46 bytes of
                        // name); ignore a truncated trailing entry
                        if feat.len() < 48 {
                            break;
                        }
                        let feat_type: FeatureType = match feat[0].try_into() {
                            Ok(ft) => ft,
                            Err(_) => continue, // skip unrecognized entries
                        };
                        // Cannot use CStr to parse this, as it may not be NUL-terminated.
                        // Use this to remove everything from the first NUL byte.
                        let feat_name_bytes = feat[2..].split(|c| *c == 0).next().unwrap();
                        // Then just use it as a UTF-8 string.
                        let feat_name = String::from_utf8_lossy(feat_name_bytes);
                        feats.insert((feat_type, feat[1]), feat_name.to_string());
                    }
                    HeaderExtension::FeatureNameTable(feats)
                }
                HeaderExtensionType::ExternalDataFileName => {
                    let filename = String::from_utf8(data).map_err(|err| {
                        invalid_data(format!("Invalid external data file name: {err}"))
                    })?;
                    HeaderExtension::ExternalDataFileName(filename)
                }
            }
        } else {
            HeaderExtension::Unknown {
                extension_type: ext_type,
                data,
            }
        };

        Ok(Some(ext))
    }

    /// Return the extension type ID.
    fn extension_type(&self) -> u32 {
        match self {
            HeaderExtension::BackingFileFormat(_) => HeaderExtensionType::BackingFileFormat as u32,
            HeaderExtension::FeatureNameTable(_) => HeaderExtensionType::FeatureNameTable as u32,
            HeaderExtension::ExternalDataFileName(_) => {
                HeaderExtensionType::ExternalDataFileName as u32
            }
            HeaderExtension::Unknown {
                extension_type,
                data: _,
            } => *extension_type,
        }
    }

    /// Serialize this extension’s data (excluding its header).
    fn serialize_data(&self) -> io::Result<Vec<u8>> {
        match self {
            HeaderExtension::BackingFileFormat(fmt) => Ok(fmt.as_bytes().into()),
            HeaderExtension::FeatureNameTable(map) => {
                let mut result = Vec::new();
                for (bit, name) in map {
                    result.push(bit.0 as u8);
                    result.push(bit.1);

                    let mut padded_name = vec![0; 46];
                    let name_bytes = name.as_bytes();
                    // Might truncate in the middle of a multibyte character, but getting that
                    // right is complicated and probably not worth it
                    let truncated_len = cmp::min(name_bytes.len(), 46);
                    padded_name[..truncated_len].copy_from_slice(&name_bytes[..truncated_len]);
                    result.extend_from_slice(&padded_name);
                }
                Ok(result)
            }
            HeaderExtension::ExternalDataFileName(filename) => Ok(filename.as_bytes().into()),
            HeaderExtension::Unknown {
                extension_type: _,
                data,
            } => Ok(data.clone()),
        }
    }

    /// Creates a [`Self::FeatureNameTable`].
    fn feature_name_table() -> Self {
        use {AutoclearFeatures as A, CompatibleFeatures as C, IncompatibleFeatures as I};

        let mut map = HashMap::new();

        map.insert(I::Dirty.into(), "dirty".into());
        map.insert(I::Corrupt.into(), "corrupt".into());
        map.insert(I::ExternalDataFile.into(), "external data file".into());
        map.insert(
            I::CompressionType.into(),
            "extended compression type".into(),
        );
        map.insert(I::ExtendedL2Entries.into(), "extended L2 entries".into());

        map.insert(C::LazyRefcounts.into(), "lazy refcounts".into());

        map.insert(A::Bitmaps.into(), "persistent dirty bitmaps".into());
        map.insert(A::RawExternalData.into(), "raw external data file".into());

        HeaderExtension::FeatureNameTable(map)
    }
}

/// L1 table entry.
///
/// - Bit 0 - 8: Reserved (set to 0)
/// - Bit 9 - 55: Bits 9-55 of the offset into the image file at which the L2 table starts.  Must
///   be aligned to a cluster boundary.  If the offset is 0, the L2 table and all clusters
///   described by this L2 table are unallocated.
/// - Bit 56 - 62: Reserved (set to 0)
/// - Bit 63: 0 for an L2 table that is unused or requires COW, 1 if its refcount is exactly one.
///   This information is only accurate in the active L1 table.
#[derive(Copy, Clone, Default, Debug)]
pub(super) struct L1Entry(u64);

impl L1Entry {
    /// Offset of the L2 table, if any.
    pub fn l2_offset(&self) -> Option<HostOffset> {
        let ofs = self.0 & 0x00ff_ffff_ffff_fe00u64;
        if ofs == 0 {
            None
        } else {
            Some(HostOffset(ofs))
        }
    }

    /// Whether the L2 table’s cluster is “copied”.
    ///
    /// `true` means its refcount is one; `false` means modifying it will require COW.
    pub fn is_copied(&self) -> bool {
        self.0 & (1u64 << 63) != 0
    }

    /// Return all reserved bits.
    pub fn reserved_bits(&self) -> u64 {
        self.0 & 0x7f00_0000_0000_01feu64
    }
}
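
// For example (illustrative): the raw L1 entry 0x8000_0000_0001_0000 has the COPIED
// bit (63) set and points to an L2 table at host offset 0x10000; all reserved bits
// (mask 0x7f00_0000_0000_01fe) are clear.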

impl TableEntry for L1Entry {
    fn try_from_plain(value: u64, header: &Header) -> io::Result<Self> {
        let entry = L1Entry(value);

        if entry.reserved_bits() != 0 {
            return Err(invalid_data(format!(
                "Invalid L1 entry 0x{value:x}, reserved bits set (0x{:x})",
                entry.reserved_bits(),
            )));
        }

        if let Some(l2_ofs) = entry.l2_offset() {
            if l2_ofs.in_cluster_offset(header.cluster_bits()) != 0 {
                return Err(invalid_data(format!(
                    "Invalid L1 entry 0x{value:x}, offset ({l2_ofs}) is not aligned to cluster size (0x{:x})",
                    header.cluster_size(),
                )));
            }
        }

        Ok(entry)
    }

    fn to_plain(&self) -> u64 {
        self.0
    }
}

/// L1 table.
#[derive(Debug)]
pub(super) struct L1Table {
    /// First cluster in the image file.
    cluster: Option<HostCluster>,

    /// Table data.
    data: Box<[L1Entry]>,

    /// log2 of the cluster size.
    cluster_bits: u32,

    /// Whether this table has been modified since it was last written.
    modified: AtomicBool,
}

impl L1Table {
    /// Create a clone that covers at least `at_least_index`.
    pub fn clone_and_grow(&self, at_least_index: usize, header: &Header) -> io::Result<Self> {
        let new_entry_count = cmp::max(at_least_index + 1, self.data.len());
        let new_entry_count =
            new_entry_count.next_multiple_of(header.cluster_size() / size_of::<L1Entry>());

        if new_entry_count > <Self as Table>::MAX_ENTRIES {
            return Err(io::Error::other(
                "Cannot grow the image to this size; L1 table would become too big",
            ));
        }

        let mut new_data = vec![L1Entry::default(); new_entry_count];
        new_data[..self.data.len()].copy_from_slice(&self.data);

        Ok(Self {
            cluster: None,
            data: new_data.into_boxed_slice(),
            cluster_bits: header.cluster_bits(),
            modified: true.into(),
        })
    }

    /// Check whether `index` is in bounds.
    pub fn in_bounds(&self, index: usize) -> bool {
        index < self.data.len()
    }

    /// Enter the given L2 table into this L1 table.
    pub fn enter_l2_table(&mut self, index: usize, l2: &L2Table) -> io::Result<()> {
        let l2_offset = l2.get_offset().ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "L2 table has no assigned offset",
            )
        })?;

        let l1entry = L1Entry((1 << 63) | l2_offset.0);
        debug_assert!(l1entry.reserved_bits() == 0);
        self.data[index] = l1entry;
        self.modified.store(true, Ordering::Relaxed);

        Ok(())
    }
}

impl Table for L1Table {
    type InternalEntry = L1Entry;
    type Entry = L1Entry;
    const NAME: &'static str = "L1 table";

    /// Maximum number of L1 table entries.
    ///
    /// Limit taken from QEMU; if QEMU rejects this, we can, too.
    const MAX_ENTRIES: usize = 4 * 1024 * 1024;

    fn from_data(data: Box<[L1Entry]>, header: &Header) -> Self {
        Self {
            cluster: None,
            data,
            cluster_bits: header.cluster_bits(),
            modified: true.into(),
        }
    }

    fn entries(&self) -> usize {
        self.data.len()
    }

    fn get_ref(&self, index: usize) -> Option<&L1Entry> {
        self.data.get(index)
    }

    fn get(&self, index: usize) -> L1Entry {
        self.data.get(index).copied().unwrap_or(L1Entry(0))
    }

    fn get_cluster(&self) -> Option<HostCluster> {
        self.cluster
    }

    fn get_offset(&self) -> Option<HostOffset> {
        self.cluster.map(|index| index.offset(self.cluster_bits))
    }

    fn set_cluster(&mut self, cluster: HostCluster) {
        self.cluster = Some(cluster);
        self.modified.store(true, Ordering::Relaxed);
    }

    fn unset_cluster(&mut self) {
        self.cluster = None;
    }

    fn is_modified(&self) -> bool {
        self.modified.load(Ordering::Relaxed)
    }

    fn clear_modified(&self) {
        self.modified.store(false, Ordering::Relaxed);
    }

    fn set_modified(&self) {
        self.modified.store(true, Ordering::Relaxed);
    }

    fn cluster_bits(&self) -> u32 {
        self.cluster_bits
    }
}

/// L2 table entry.
///
/// - Bit 0 - 61: Cluster descriptor
/// - Bit 62: 0 for standard clusters, 1 for compressed clusters
/// - Bit 63: 0 for clusters that are unused, compressed or require COW.  1 for standard clusters
///   whose refcount is exactly one.  This information is only accurate in L2 tables that are
///   reachable from the active L1 table.  With external data files, all guest clusters have an
///   implicit refcount of 1 (because of the fixed host = guest mapping for guest cluster offsets),
///   so this bit should be 1 for all allocated clusters.
///
/// Standard Cluster Descriptor:
/// - Bit 0: If set to 1, the cluster reads as all zeros. The host cluster offset can be used to
///   describe a preallocation, but it won’t be used for reading data from this cluster, nor is
///   data read from the backing file if the cluster is unallocated.  With version 2 or with
///   extended L2 entries (see the next section), this is always 0.
/// - Bit 1 - 8: Reserved (set to 0)
/// - Bit 9 - 55: Bits 9-55 of host cluster offset. Must be aligned to a cluster boundary. If the
///   offset is 0 and bit 63 is clear, the cluster is unallocated. The offset may only be 0 with
///   bit 63 set (indicating a host cluster offset of 0) when an external data file is used.
/// - Bit 56 - 61: Reserved (set to 0)
///
/// Compressed Cluster Descriptor (`x = 62 - (cluster_bits - 8)`):
/// - Bit 0 - x-1: Host cluster offset.  This is usually _not_ aligned to a cluster or sector
///   boundary!  If cluster_bits is small enough that this field includes bits beyond 55, those
///   upper bits must be set to 0.
/// - Bit x - 61: Number of additional 512-byte sectors used for the compressed data, beyond the
///   sector containing the offset in the previous field. Some of these sectors may reside in the
///   next contiguous host cluster.  Note that the compressed data does not necessarily occupy all
///   of the bytes in the final sector; rather, decompression stops when it has produced a cluster
///   of data.  Another compressed cluster may map to the tail of the final sector used by this
///   compressed cluster.
#[derive(Copy, Clone, Default, Debug)]
pub(super) struct L2Entry(u64);

/// Actual internal type of L2 entries.
///
/// Using an atomic allows flushing L2 tables from the cache while they are write-locked.
#[derive(Default, Debug)]
pub(super) struct AtomicL2Entry(AtomicU64);

/// High-level representation of an L2 entry.
#[derive(Debug, Clone)]
pub(super) enum L2Mapping {
    /// Data is in the data file.
    DataFile {
        /// Cluster in the data file.
        host_cluster: HostCluster,

        /// Whether the cluster has a refcount of exactly 1.
        copied: bool,
    },

    /// Data is in the backing file.
    Backing {
        /// Guest cluster index.
        backing_offset: u64,
    },

    /// Data is zero.
    Zero {
        /// Preallocated cluster in the data file, if any.
        host_cluster: Option<HostCluster>,

        /// Whether the preallocated cluster has a refcount of exactly 1.
        copied: bool,
    },

    /// Data is compressed.
    Compressed {
        /// Offset in the data file.
        host_offset: HostOffset,

        /// Upper limit on the number of bytes that comprise the compressed data.
        length: u64,
    },
}

impl L2Entry {
    /// Offset of the data cluster, if any.
    ///
    /// Assumes the L2 entry references a data cluster, not a compressed cluster.
    ///
    /// `external_data_file` must be true when using an external data file; in this case, offset 0
    /// is a valid offset, and can only be distinguished from “unallocated” by whether the COPIED
    /// flag is set or not (which it always is when using an external data file).
    pub fn cluster_offset(&self, external_data_file: bool) -> Option<HostOffset> {
        let ofs = self.0 & 0x00ff_ffff_ffff_fe00u64;
        if ofs != 0 || (external_data_file && self.is_copied()) {
            Some(HostOffset(ofs))
        } else {
            None
        }
    }

    /// Whether the cluster is compressed.
    pub fn is_compressed(&self) -> bool {
        self.0 & (1u64 << 62) != 0
    }

    /// Whether the cluster is “copied”.
    ///
    /// `true` means its refcount is one; `false` means modifying it will require COW.
    pub fn is_copied(&self) -> bool {
        self.0 & (1u64 << 63) != 0
    }

    /// Clear “copied” flag.
    #[must_use]
    pub fn without_copied(self) -> Self {
        L2Entry(self.0 & !(1u64 << 63))
    }

    /// Whether the cluster is a zero cluster.
    ///
    /// Assumes the L2 entry references a data cluster, not a compressed cluster.
    pub fn is_zero(&self) -> bool {
        self.0 & (1u64 << 0) != 0
    }

    /// Return all reserved bits.
    pub fn reserved_bits(&self) -> u64 {
        if self.is_compressed() {
            self.0 & 0x8000_0000_0000_0000u64
        } else {
            self.0 & 0x3f00_0000_0000_01feu64
        }
    }

    /// Return the full compressed cluster descriptor.
    pub fn compressed_descriptor(&self) -> u64 {
        self.0 & 0x3fff_ffff_ffff_ffffu64
    }

    /// If this entry is compressed, return the start host offset and upper limit on the compressed
    /// number of bytes.
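    ///
    /// Worked example (illustrative): with 64 KiB clusters (`cluster_bits == 16`),
    /// `x = 62 - (16 - 8) = 54`, so bits 0-53 hold the host offset and bits 54-61 the
    /// additional-sector count.  An entry with host offset 0x10300 and 3 additional
    /// sectors yields a length limit of `(3 + 1) * 512 - (0x10300 & 511) == 1792` bytes.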
    pub fn compressed_range(&self, cluster_bits: u32) -> Option<(HostOffset, u64)> {
        if self.is_compressed() {
            let desc = self.compressed_descriptor();
            let compressed_offset_bits = 62 - (cluster_bits - 8);
            let offset = desc & ((1 << compressed_offset_bits) - 1) & 0x00ff_ffff_ffff_ffffu64;
            let sectors = desc >> compressed_offset_bits;
            // The first sector is not considered in `sectors`, so we add it and subtract the
            // number of bytes there that do not belong to this compressed cluster
            let length = (sectors + 1) * 512 - (offset & 511);

            Some((HostOffset(offset), length))
        } else {
            None
        }
    }

    /// If this entry is allocated, return the first host cluster and the number of clusters it
    /// references.
    ///
    /// `external_data_file` must be true when using an external data file.
    fn allocation(
        &self,
        cluster_bits: u32,
        external_data_file: bool,
    ) -> Option<(HostCluster, ClusterCount)> {
        if let Some((offset, length)) = self.compressed_range(cluster_bits) {
            // Compressed clusters can cross host cluster boundaries, and may thus occupy two
            // host clusters
            let first_cluster = offset.cluster(cluster_bits);
            let cluster_count = ClusterCount::from_byte_size(
                offset + length - first_cluster.offset(cluster_bits),
                cluster_bits,
            );
            Some((first_cluster, cluster_count))
        } else {
            self.cluster_offset(external_data_file)
                .map(|ofs| (ofs.cluster(cluster_bits), ClusterCount(1)))
        }
    }

    /// Return the high-level `L2Mapping` representation.
    ///
    /// `guest_cluster` is the guest cluster being accessed, `cluster_bits` is log2 of the cluster
    /// size.  `external_data_file` must be true when using an external data file.
    fn into_mapping(
        self,
        guest_cluster: GuestCluster,
        cluster_bits: u32,
        external_data_file: bool,
    ) -> io::Result<L2Mapping> {
        let mapping = if let Some((offset, length)) = self.compressed_range(cluster_bits) {
            L2Mapping::Compressed {
                host_offset: offset,
                length,
            }
        } else if self.is_zero() {
            let host_cluster = self
                .cluster_offset(external_data_file)
                .map(|ofs| {
                    ofs.checked_cluster(cluster_bits).ok_or_else(|| {
                        let offset = guest_cluster.offset(cluster_bits);
                        io::Error::other(format!(
                            "Unaligned pre-allocated zero cluster at {offset}; L2 entry: {self:?}"
                        ))
                    })
                })
                .transpose()?;

            L2Mapping::Zero {
                host_cluster,
                copied: host_cluster.is_some() && self.is_copied(),
            }
        } else if let Some(host_offset) = self.cluster_offset(external_data_file) {
            let host_cluster = host_offset.checked_cluster(cluster_bits).ok_or_else(|| {
                let offset = guest_cluster.offset(cluster_bits);
                io::Error::other(format!(
                    "Unaligned data cluster at {offset}; L2 entry: {self:?}"
                ))
            })?;

            L2Mapping::DataFile {
                host_cluster,
                copied: self.is_copied(),
            }
        } else {
            L2Mapping::Backing {
                backing_offset: guest_cluster.offset(cluster_bits).0,
            }
        };

        Ok(mapping)
    }

    /// Create an L2 entry from its high-level `L2Mapping` representation.
    fn from_mapping(value: L2Mapping, cluster_bits: u32) -> Self {
        let num_val: u64 = match value {
            L2Mapping::DataFile {
                host_cluster,
                copied,
            } => {
                debug_assert!(host_cluster.offset(cluster_bits) <= MAX_OFFSET);
                if copied {
                    (1 << 63) | host_cluster.offset(cluster_bits).0
                } else {
                    host_cluster.offset(cluster_bits).0
                }
            }

            L2Mapping::Backing { backing_offset: _ } => 0,

            L2Mapping::Zero {
                host_cluster,
                copied,
            } => {
                let host_offset = host_cluster.map(|hc| hc.offset(cluster_bits));
                debug_assert!(host_offset.unwrap_or(HostOffset(0)) <= MAX_OFFSET);
                if copied {
                    (1 << 63) | host_offset.unwrap().0 | 0x1
                } else {
                    host_offset.unwrap_or(HostOffset(0)).0 | 0x1
                }
            }

            L2Mapping::Compressed {
                host_offset,
                length,
            } => {
                let compressed_offset_bits = 62 - (cluster_bits - 8);
                assert!(length < 1 << cluster_bits);
                assert!(host_offset.0 < 1 << compressed_offset_bits);

                // The first sector is not considered, so we subtract the number of bytes in it
                // that belong to this compressed cluster from `length`:
                // ceil((length - (512 - (host_offset & 511))) / 512)
                // = (length + 511 - 512 + (host_offset & 511)) / 512
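                // E.g. (hypothetical): length = 1280 and host_offset & 511 = 256 give
                // sectors = (1280 - 1 + 256) / 512 = 2, matching the two additional
                // sectors decoded in `compressed_range()`.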
                let sectors = (length - 1 + (host_offset.0 & 511)) / 512;

                (1 << 62) | (sectors << compressed_offset_bits) | host_offset.0
            }
        };

        let entry = L2Entry(num_val);
        debug_assert!(entry.reserved_bits() == 0);
        entry
    }
}

impl AtomicL2Entry {
    /// Get the contained value.
    fn get(&self) -> L2Entry {
        L2Entry(self.0.load(Ordering::Relaxed))
    }

    /// Exchange the contained value.
    ///
    /// # Safety
    /// Caller must ensure that:
    /// (1) No reader sees invalid intermediate states.
    /// (2) Updates are done atomically (do not depend on prior state of the L2 table), or there is
    ///     only one writer at a time.
    unsafe fn swap(&self, l2e: L2Entry) -> L2Entry {
        L2Entry(self.0.swap(l2e.0, Ordering::Relaxed))
    }
}

impl TableEntry for AtomicL2Entry {
    fn try_from_plain(value: u64, header: &Header) -> io::Result<Self> {
        let entry = L2Entry(value);

        if entry.reserved_bits() != 0 {
            return Err(invalid_data(format!(
                "Invalid L2 entry 0x{value:x}, reserved bits set (0x{:x})",
                entry.reserved_bits(),
            )));
        }

        if let Some(offset) = entry.cluster_offset(header.external_data_file()) {
            if !entry.is_compressed() && offset.in_cluster_offset(header.cluster_bits()) != 0 {
                return Err(invalid_data(format!(
                    "Invalid L2 entry 0x{value:x}, offset ({offset}) is not aligned to cluster size (0x{:x})",
                    header.cluster_size(),
                )));
            }
        }

        Ok(AtomicL2Entry(AtomicU64::new(entry.0)))
    }

    fn to_plain(&self) -> u64 {
        self.get().0
    }
}

impl L2Mapping {
    /// Check whether two mappings are consecutive.
    ///
    /// Given the `preceding` mapping, check whether `self` is consecutive to it, i.e. is the same
    /// kind of mapping, and the offsets are consecutive.
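    ///
    /// A minimal sketch (hypothetical clusters; `16` is log2 of a 64 KiB cluster size):
    ///
    /// ```
    /// let a = L2Mapping::DataFile { host_cluster: HostCluster(7), copied: true };
    /// let b = L2Mapping::DataFile { host_cluster: HostCluster(8), copied: true };
    /// assert!(b.is_consecutive(&a, 16));
    /// ```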
    pub fn is_consecutive(&self, preceding: &L2Mapping, cluster_bits: u32) -> bool {
        match preceding {
            L2Mapping::DataFile {
                host_cluster: prior_cluster,
                copied,
            } => {
                if let L2Mapping::DataFile {
                    host_cluster: next_cluster,
                    copied: next_copied,
                } = self
                {
                    *next_cluster == *prior_cluster + ClusterCount(1) && *next_copied == *copied
                } else {
                    false
                }
            }

            L2Mapping::Backing {
                backing_offset: prior_backing_offset,
            } => {
                let Some(expected_next) = prior_backing_offset.checked_add(1 << cluster_bits)
                else {
                    return false;
                };

                if let L2Mapping::Backing {
                    backing_offset: next_offset,
                } = self
                {
                    *next_offset == expected_next
                } else {
                    false
                }
            }

            L2Mapping::Zero {
                host_cluster: _,
                copied: _,
            } => {
                // Cluster and copied do not matter; every read is continuous regardless (always
                // zero), and every write is, too (always allocate)
                matches!(
                    self,
                    L2Mapping::Zero {
                        host_cluster: _,
                        copied: _,
                    }
                )
            }

            L2Mapping::Compressed {
                host_offset: _,
                length: _,
            } => {
                // Not really true, but in practice it is.  Reads need to go through a special
                // function anyway, and every write will need COW anyway.
                matches!(
                    self,
                    L2Mapping::Compressed {
                        host_offset: _,
                        length: _,
                    }
                )
            }
        }
    }
}

/// L2 table.
#[derive(Debug)]
pub(super) struct L2Table {
    /// Cluster of the L2 table.
    cluster: Option<HostCluster>,

    /// Table data.
    data: Box<[AtomicL2Entry]>,

    /// log2 of the cluster size.
    cluster_bits: u32,

    /// Whether this image uses an external data file.
    external_data_file: bool,

    /// Whether this table has been modified since it was last written.
    modified: AtomicBool,

    /// Lock for creating `L2TableWriteGuard`.
    writer_lock: Mutex<()>,
}

/// Write guard for an L2 table.
#[derive(Debug)]
pub(super) struct L2TableWriteGuard<'a> {
    /// Referenced L2 table.
    table: &'a L2Table,

    /// Held mutex guard on that L2 table.
    _lock: MutexGuard<'a, ()>,
}

impl L2Table {
    /// Create a new zeroed L2 table.
    pub fn new_cleared(header: &Header) -> Self {
        let mut data = Vec::with_capacity(header.l2_entries());
        data.resize_with(header.l2_entries(), Default::default);

        L2Table {
            cluster: None,
            data: data.into_boxed_slice(),
            cluster_bits: header.cluster_bits(),
            external_data_file: header.external_data_file(),
            modified: true.into(),
            writer_lock: Default::default(),
        }
    }

    /// Look up a cluster mapping.
    pub fn get_mapping(&self, lookup_cluster: GuestCluster) -> io::Result<L2Mapping> {
        self.get(lookup_cluster.l2_index(self.cluster_bits))
            .into_mapping(lookup_cluster, self.cluster_bits, self.external_data_file)
    }

    /// Allow modifying this L2 table.
    ///
    /// Note that readers are allowed to exist while modifications are happening.
    pub async fn lock_write(&self) -> L2TableWriteGuard<'_> {
        L2TableWriteGuard {
            table: self,
            _lock: self.writer_lock.lock().await,
        }
    }
}

impl L2TableWriteGuard<'_> {
    /// Look up a cluster mapping.
    pub fn get_mapping(&self, lookup_cluster: GuestCluster) -> io::Result<L2Mapping> {
        self.table.get_mapping(lookup_cluster)
    }

    /// Enter the given raw data cluster mapping into the L2 table.
    ///
    /// If the previous entry pointed to an allocated cluster, return the old allocation so its
    /// refcount can be decreased (offset of the first cluster and number of clusters -- compressed
    /// clusters can span across host cluster boundaries).
    ///
    /// If the allocation is reused, `None` is returned, so this function only returns `Some(_)` if
    /// some cluster is indeed leaked.
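    ///
    /// A sketch of the intended call pattern (hypothetical names, refcount handling elided):
    ///
    /// ```
    /// let mut guard = l2_table.lock_write().await;
    /// if let Some((first, count)) = guard.map_cluster(index, new_cluster) {
    ///     // The old allocation at `first` spanning `count` clusters is now orphaned;
    ///     // the caller must decrement its refcounts.
    /// }
    /// ```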
    #[must_use = "Leaked allocation must be freed"]
    pub fn map_cluster(
        &mut self,
        index: usize,
        host_cluster: HostCluster,
    ) -> Option<(HostCluster, ClusterCount)> {
        let new = L2Entry::from_mapping(
            L2Mapping::DataFile {
                host_cluster,
                copied: true,
            },
            self.table.cluster_bits,
        );
        // Safe: We set a full valid mapping, and there is only one writer (thanks to
        // `L2TableWriteGuard`).
        let l2e = unsafe { self.table.data[index].swap(new) };
        self.table.modified.store(true, Ordering::Relaxed);

        let allocation = l2e.allocation(self.table.cluster_bits, self.table.external_data_file);
        if let Some((a_cluster, a_count)) = allocation {
            if a_cluster == host_cluster && a_count == ClusterCount(1) {
                None
            } else {
                allocation
            }
        } else {
            None
        }
    }

    /// Make the given index a zero mapping.
    ///
    /// If `keep_allocation` is true, keep the zero cluster pre-allocated if there is a
    /// pre-existing single-cluster allocation (i.e. data cluster or pre-allocated zero cluster).
    /// Otherwise, the existing mapping is discarded.
    ///
    /// If a previous mapping is discarded, return the old allocation so its refcount can be
    /// decreased (offset of the first cluster and number of clusters -- compressed clusters can
    /// span across host cluster boundaries).
    #[must_use = "Leaked allocation must be freed"]
    pub fn zero_cluster(
        &mut self,
        index: usize,
        keep_allocation: bool,
    ) -> io::Result<Option<(HostCluster, ClusterCount)>> {
        let cluster_copied = if keep_allocation {
            match self.table.data[index].get().into_mapping(
                GuestCluster(0), // only used for backing, which we ignore
                self.table.cluster_bits,
                self.table.external_data_file,
            )? {
                L2Mapping::DataFile {
                    host_cluster,
                    copied,
                } => Some((host_cluster, copied)),
                L2Mapping::Backing { backing_offset: _ } => None,
                L2Mapping::Zero {
                    host_cluster: Some(host_cluster),
                    copied,
                } => Some((host_cluster, copied)),
                L2Mapping::Zero {
                    host_cluster: None,
                    copied: _,
                } => None,
                L2Mapping::Compressed {
                    host_offset: _,
                    length: _,
                } => None,
            }
        } else {
            None
        };

        let retained = cluster_copied.is_some();
        let new = if let Some((cluster, copied)) = cluster_copied {
            L2Mapping::Zero {
                host_cluster: Some(cluster),
                copied,
            }
        } else {
            L2Mapping::Zero {
                host_cluster: None,
                copied: false,
            }
        };
        let new = L2Entry::from_mapping(new, self.table.cluster_bits);

        // Safe: We set a full valid mapping, and there is only one writer (thanks to
        // `L2TableWriteGuard`).
        let old = unsafe { self.table.data[index].swap(new) };
        self.table.modified.store(true, Ordering::Relaxed);

        let leaked = if !retained {
            old.allocation(self.table.cluster_bits, self.table.external_data_file)
        } else {
            None
        };
        Ok(leaked)
    }

    /// Remove the given mapping, leaving it empty.
    ///
    /// If a previous mapping is discarded, return the old allocation so its refcount can be
    /// decreased (offset of the first cluster and number of clusters -- compressed clusters can
    /// span across host cluster boundaries).
    #[must_use = "Leaked allocation must be freed"]
    pub fn discard_cluster(&mut self, index: usize) -> Option<(HostCluster, ClusterCount)> {
        let new = L2Entry(0);

        // Safe: We set a full valid mapping, and there is only one writer (thanks to
        // `L2TableWriteGuard`).
        let old = unsafe { self.table.data[index].swap(new) };
        self.table.modified.store(true, Ordering::Relaxed);

        old.allocation(self.table.cluster_bits, self.table.external_data_file)
    }
}

impl Table for L2Table {
    type InternalEntry = AtomicL2Entry;
    type Entry = L2Entry;
    const NAME: &'static str = "L2 table";
    const MAX_ENTRIES: usize = MAX_CLUSTER_SIZE / 8;

    fn from_data(data: Box<[AtomicL2Entry]>, header: &Header) -> Self {
        assert!(data.len() == header.l2_entries());

        Self {
            cluster: None,
            data,
            cluster_bits: header.cluster_bits(),
            external_data_file: header.external_data_file(),
            modified: true.into(),
            writer_lock: Default::default(),
        }
    }

    fn entries(&self) -> usize {
        self.data.len()
    }

    fn get_ref(&self, index: usize) -> Option<&AtomicL2Entry> {
        self.data.get(index)
    }

    fn get(&self, index: usize) -> L2Entry {
        self.data
            .get(index)
            .map(|l2e| l2e.get())
            .unwrap_or(L2Entry(0))
    }

    fn get_cluster(&self) -> Option<HostCluster> {
        self.cluster
    }

    fn get_offset(&self) -> Option<HostOffset> {
        self.cluster.map(|index| index.offset(self.cluster_bits))
    }

    fn set_cluster(&mut self, cluster: HostCluster) {
        self.cluster = Some(cluster);
        self.modified.store(true, Ordering::Relaxed);
    }

    fn unset_cluster(&mut self) {
        self.cluster = None;
    }

    fn is_modified(&self) -> bool {
        self.modified.load(Ordering::Relaxed)
    }

    fn clear_modified(&self) {
        self.modified.store(false, Ordering::Relaxed);
    }

    fn set_modified(&self) {
        self.modified.store(true, Ordering::Relaxed);
    }

    fn cluster_bits(&self) -> u32 {
        self.cluster_bits
    }
}

impl Clone for L2Table {
    fn clone(&self) -> Self {
        let mut data = Vec::with_capacity(self.data.len());
        for entry in &self.data {
            // None of these can be `copied`
            let entry = entry.get().without_copied();
            data.push(AtomicL2Entry(AtomicU64::new(entry.0)));
        }

        let modified = AtomicBool::new(self.is_modified());

        L2Table {
            cluster: None,
            data: data.into_boxed_slice(),
            cluster_bits: self.cluster_bits,
            external_data_file: self.external_data_file,
            modified,
            writer_lock: Default::default(),
        }
    }
}

impl Drop for L2Table {
    fn drop(&mut self) {
        if self.is_modified() {
            error!("L2 table dropped while modified; was the image closed before being flushed?");
        }
    }
}

/// Refcount table entry.
#[derive(Copy, Clone, Default, Debug)]
pub(super) struct RefTableEntry(u64);

impl RefTableEntry {
    /// Offset of the referenced refblock, if any.
    pub fn refblock_offset(&self) -> Option<HostOffset> {
        let ofs = self.0 & 0xffff_ffff_ffff_fe00u64;
        if ofs == 0 {
            None
        } else {
            Some(HostOffset(ofs))
        }
    }

    /// Return all reserved bits.
    pub fn reserved_bits(&self) -> u64 {
        self.0 & 0x0000_0000_0000_01ffu64
    }
}

impl TableEntry for RefTableEntry {
    fn try_from_plain(value: u64, header: &Header) -> io::Result<Self> {
        let entry = RefTableEntry(value);

        if entry.reserved_bits() != 0 {
            return Err(invalid_data(format!(
                "Invalid reftable entry 0x{value:x}, reserved bits set (0x{:x})",
                entry.reserved_bits(),
            )));
        }

        if let Some(rb_ofs) = entry.refblock_offset() {
            if rb_ofs.in_cluster_offset(header.cluster_bits()) != 0 {
                return Err(invalid_data(
                    format!(
                        "Invalid reftable entry 0x{value:x}, offset ({rb_ofs}) is not aligned to cluster size (0x{:x})",
                        header.cluster_size(),
                    ),
                ));
            }
        }

        Ok(entry)
    }

    fn to_plain(&self) -> u64 {
        self.0
    }
}

/// Refcount table.
#[derive(Debug)]
pub(super) struct RefTable {
    /// First cluster of the table in the image file.
    cluster: Option<HostCluster>,

    /// Table data.
    data: Box<[RefTableEntry]>,

    /// log2 of the cluster size.
    cluster_bits: u32,

    /// Whether this table has been modified since it was last written.
    modified: AtomicBool,
}

impl RefTable {
    /// Create a clone that covers at least `at_least_index`.
    ///
    /// Also ensure that beyond `at_least_index`, there are enough entries to self-describe the new
    /// refcount table (so that it can actually be allocated).
    pub fn clone_and_grow(&self, header: &Header, at_least_index: usize) -> io::Result<Self> {
        let cluster_size = header.cluster_size();
        let rb_entries = header.rb_entries();

        // There surely is an optimal O(1) solution, but it would probably look less clear, and
        // this is not a hot path.
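        // E.g. (hypothetical): with 64 KiB clusters and 16-bit refcounts, one refblock
        // covers 32768 clusters and one reftable cluster holds 8192 entries.  Growing a
        // small table to cover `at_least_index` = 10 settles on the first iteration:
        // 8192 entries (one reftable cluster) plus one extra refblock, and that single
        // refblock easily covers both new clusters.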
        let mut extra_rbs = 1;
        let new_entry_count = loop {
            let entry_count = cmp::max(at_least_index + 1 + extra_rbs, self.data.len());
            let entry_count = entry_count.next_multiple_of(cluster_size / size_of::<u64>());
            let size = entry_count * size_of::<u64>();
            // Full number of clusters needed for both the new reftable *and* the `extra_rbs`
            let refcount_clusters = size / cluster_size + extra_rbs;
            let rbs_needed = refcount_clusters.div_ceil(rb_entries);
            if extra_rbs == rbs_needed {
                break entry_count;
            }
            extra_rbs = rbs_needed;
        };

        if new_entry_count > <Self as Table>::MAX_ENTRIES {
            return Err(io::Error::other(
                "Cannot grow the image to this size; refcount table would become too big",
            ));
        }

        let mut new_data = vec![RefTableEntry::default(); new_entry_count];
        new_data[..self.data.len()].copy_from_slice(&self.data);

        Ok(Self {
            cluster: None,
            data: new_data.into_boxed_slice(),
            cluster_bits: header.cluster_bits(),
            modified: true.into(),
        })
    }

    /// Check whether `index` is in bounds.
    pub fn in_bounds(&self, index: usize) -> bool {
        index < self.data.len()
    }

    /// Enter the given refcount block into this refcount table.
    pub fn enter_refblock(&mut self, index: usize, rb: &RefBlock) -> io::Result<()> {
        let rb_offset = rb.get_offset().ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "Refcount block has no assigned offset",
            )
        })?;

        let rt_entry = RefTableEntry(rb_offset.0);
        debug_assert!(rt_entry.reserved_bits() == 0);
        self.data[index] = rt_entry;
        self.modified.store(true, Ordering::Relaxed);

        Ok(())
    }
}

impl Table for RefTable {
    type InternalEntry = RefTableEntry;
    type Entry = RefTableEntry;
    const NAME: &'static str = "Refcount table";

    /// Maximum number of refcount table entries.
    ///
    /// QEMU does not impose such a limit, but it makes sense to apply the same limit as for the
    /// L1 table.  Note that refcount blocks usually cover more clusters than an L2 table, so this
    /// generally allows larger image files than would be necessary for the maximum guest disk
    /// size determined by the maximum number of L1 entries.
    const MAX_ENTRIES: usize = <L1Table as Table>::MAX_ENTRIES;

    fn from_data(data: Box<[RefTableEntry]>, header: &Header) -> Self {
        Self {
            cluster: None,
            data,
            cluster_bits: header.cluster_bits(),
            modified: true.into(),
        }
    }

    fn entries(&self) -> usize {
        self.data.len()
    }

    fn get_ref(&self, index: usize) -> Option<&RefTableEntry> {
        self.data.get(index)
    }

    fn get(&self, index: usize) -> RefTableEntry {
        self.data.get(index).copied().unwrap_or(RefTableEntry(0))
    }

    fn get_cluster(&self) -> Option<HostCluster> {
        self.cluster
    }

    fn get_offset(&self) -> Option<HostOffset> {
        self.cluster.map(|index| index.offset(self.cluster_bits))
    }

    fn set_cluster(&mut self, cluster: HostCluster) {
        self.cluster = Some(cluster);
        self.modified.store(true, Ordering::Relaxed);
    }

    fn unset_cluster(&mut self) {
        self.cluster = None;
    }

    fn is_modified(&self) -> bool {
        self.modified.load(Ordering::Relaxed)
    }

    fn clear_modified(&self) {
        self.modified.store(false, Ordering::Relaxed);
    }

    fn set_modified(&self) {
        self.modified.store(true, Ordering::Relaxed);
    }

    fn cluster_bits(&self) -> u32 {
        self.cluster_bits
    }
}

/// Refcount block.
pub(super) struct RefBlock {
    /// Cluster in the image file.
    cluster: Option<HostCluster>,

    /// Raw table data (big endian).
    raw_data: IoBuffer,

    /// log2 of the refcount bits.
    refcount_order: u32,

    /// log2 of the cluster size.
    cluster_bits: u32,

    /// Whether this block has been modified since it was last written.
    modified: AtomicBool,

    /// Lock for creating `RefBlockWriteGuard`.
    writer_lock: Mutex<()>,
}

/// Write guard for a refblock.
pub(super) struct RefBlockWriteGuard<'a> {
    /// Referenced refblock.
    rb: &'a RefBlock,

    /// Held mutex guard on that refblock.
    _lock: MutexGuard<'a, ()>,
}

impl RefBlock {
    /// Create a new zeroed refcount block.
    pub fn new_cleared<S: Storage>(for_image: &S, header: &Header) -> io::Result<Self> {
        let mut raw_data = IoBuffer::new(header.cluster_size(), for_image.mem_align())?;
        raw_data.as_mut().into_slice().fill(0);

        Ok(RefBlock {
            cluster: None,
            raw_data,
            refcount_order: header.refcount_order(),
            cluster_bits: header.cluster_bits(),
            modified: true.into(),
            writer_lock: Default::default(),
        })
    }

    /// Load a refcount block from disk.
    pub async fn load<S: Storage>(
        image: &S,
        header: &Header,
        cluster: HostCluster,
    ) -> io::Result<Self> {
        let cluster_bits = header.cluster_bits();
        let cluster_size = 1 << cluster_bits;
        let refcount_order = header.refcount_order();
        let offset = cluster.offset(cluster_bits);

        check_table(
            "Refcount block",
            offset.0,
            cluster_size,
            1,
            MAX_CLUSTER_SIZE,
            cluster_size,
        )?;

        let mut raw_data =
            IoBuffer::new(cluster_size, cmp::max(image.mem_align(), size_of::<u64>()))?;
        image.read(&mut raw_data, offset.0).await?;

        Ok(RefBlock {
            cluster: Some(cluster),
            raw_data,
            refcount_order,
            cluster_bits,
            modified: false.into(),
            writer_lock: Default::default(),
        })
    }

    /// Write a refcount block to disk.
    pub async fn write<S: Storage>(&self, image: &S) -> io::Result<()> {
        let offset = self
            .get_offset()
            .ok_or_else(|| io::Error::other("Cannot write qcow2 refcount block, no offset set"))?;

        self.clear_modified();
        if let Err(err) = image.write(self.raw_data.as_ref(), offset.0).await {
            self.set_modified();
            return Err(err);
        }

        Ok(())
    }

    /// Get the block’s cluster in the image file.
    pub fn get_cluster(&self) -> Option<HostCluster> {
        self.cluster
    }

    /// Get the block’s offset in the image file.
    pub fn get_offset(&self) -> Option<HostOffset> {
        self.cluster.map(|index| index.offset(self.cluster_bits))
    }

    /// Change the block’s cluster in the image file (for writing).
    pub fn set_cluster(&mut self, cluster: HostCluster) {
        self.cluster = Some(cluster);
        self.set_modified();
    }

    /// Calculate sub-byte refcount access parameters.
    ///
    /// For a given refcount index, return its:
    /// - byte index,
    /// - access mask,
    /// - in-byte shift.
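    ///
    /// A worked sketch of the arithmetic (hypothetical: `refcount_order` = 1, i.e. 2 bits per
    /// refcount, at index 5):
    ///
    /// ```
    /// let (order, index) = (1u32, 5usize);
    /// assert_eq!(index >> (3 - order), 1); // byte index
    /// assert_eq!((1u8 << (1u32 << order)) - 1, 0b11); // mask
    /// assert_eq!((index << order) % 8, 2); // in-byte shift
    /// ```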
    fn sub_byte_refcount_access(&self, index: usize) -> (usize, u8, usize) {
        let order = self.refcount_order;
        debug_assert!(order < 3);

        // Note that `order` is in bits, i.e. `1 << order` is the number of bits per refcount.
        // `index` is in units of refcounts, so `index << order` is the bit index; dividing that
        // by 8 (shifting right by 3) gives the byte index, i.e. `index >> (3 - order)`.
        let byte_index = index >> (3 - order);
        // `1 << order` is the bits per refcount (bprc), so `(1 << bprc) - 1` is the mask for one
        // refcount (its maximum value).
        let mask = (1 << (1 << order)) - 1;
        // `index` is in units of refcounts, so `index << order` is the bit index.  Taking it
        // modulo 8 gives the refcount’s base bit index inside its byte.
        let shift = (index << order) % 8;

        (byte_index, mask, shift)
    }

    /// Get the given cluster’s refcount.
    pub fn get(&self, index: usize) -> u64 {
        match self.refcount_order {
            // refcount_bits == 1, 2, 4
            0..=2 => {
                let (index, mask, shift) = self.sub_byte_refcount_access(index);
                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u8>() };
                let atomic =
                    unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) };
                ((atomic.load(Ordering::Relaxed) >> shift) & mask) as u64
            }

            // refcount_bits == 8
            3 => {
                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u8>() };
                let atomic =
                    unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) };
                atomic.load(Ordering::Relaxed) as u64
            }

            // refcount_bits == 16
            4 => {
                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u16>() };
                let atomic = unsafe {
                    AtomicU16::from_ptr(&raw_data_slice[index] as *const u16 as *mut u16)
                };
                u16::from_be(atomic.load(Ordering::Relaxed)) as u64
            }

            // refcount_bits == 32
            5 => {
                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u32>() };
                let atomic = unsafe {
                    AtomicU32::from_ptr(&raw_data_slice[index] as *const u32 as *mut u32)
                };
                u32::from_be(atomic.load(Ordering::Relaxed)) as u64
            }

            // refcount_bits == 64
            6 => {
                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u64>() };
                let atomic = unsafe {
                    AtomicU64::from_ptr(&raw_data_slice[index] as *const u64 as *mut u64)
                };
                u64::from_be(atomic.load(Ordering::Relaxed))
            }

            _ => unreachable!(),
        }
    }

    /// Allow modifying this refcount block.
    ///
    /// Note that readers are allowed to exist while modifications are happening.
    pub async fn lock_write(&self) -> RefBlockWriteGuard<'_> {
        RefBlockWriteGuard {
            rb: self,
            _lock: self.writer_lock.lock().await,
        }
    }

    /// Check whether this block has been modified since it was last written.
    pub fn is_modified(&self) -> bool {
        self.modified.load(Ordering::Relaxed)
    }

    /// Clear the modified flag.
    pub fn clear_modified(&self) {
        self.modified.store(false, Ordering::Relaxed);
    }

    /// Set the modified flag.
    pub fn set_modified(&self) {
        self.modified.store(true, Ordering::Relaxed);
    }

    /// Check whether the given cluster’s refcount is 0.
    pub fn is_zero(&self, index: usize) -> bool {
        self.get(index) == 0
    }
}

impl RefBlockWriteGuard<'_> {
    /// # Safety
    /// Caller must ensure there are no concurrent writers.
    unsafe fn fetch_update_bitset(
        bitset: &AtomicU8,
        change: i64,
        base_mask: u8,
        shift: usize,
    ) -> io::Result<u64> {
        let mask = base_mask << shift;
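        // E.g. (hypothetical): base_mask = 0b11 and shift = 4 give mask = 0b11_0000;
        // with full = 0b0110_0000, old = 0b10, and a change of +1 stores 0b0111_0000.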

        // load + store is OK without concurrent writers
        let full = bitset.load(Ordering::Relaxed);
        let old = (full & mask) >> shift;
        let new = if change > 0 {
            let change = change.try_into().map_err(|_| {
                io::Error::new(
                    io::ErrorKind::InvalidInput,
                    format!("Requested refcount change of {change} is too big for the image’s refcount width"),
                )
            })?;
            old.checked_add(change)
        } else {
            let change = (-change).try_into().map_err(|_| {
                io::Error::new(
                    io::ErrorKind::InvalidInput,
                    format!("Requested refcount change of {change} is too big for the image’s refcount width"),
                )
            })?;
            old.checked_sub(change)
        };
        let new = new.ok_or_else(|| {
            invalid_data(format!(
                "Changing refcount from {old} by {change} would overflow"
            ))
        })?;
        if new > base_mask {
            return Err(invalid_data(format!(
                "Changing refcount from {old} to {new} (by {change}) would overflow"
            )));
        }

        let full = (full & !mask) | (new << shift);
        bitset.store(full, Ordering::Relaxed);
        Ok(old as u64)
    }

    /// # Safety
    /// Caller must ensure there are no concurrent writers.
    unsafe fn fetch_update_full<
        T,
        L: FnOnce(&T) -> u64,
        S: FnOnce(&T, u64) -> Result<(), TryFromIntError>,
    >(
        atomic: &T,
        change: i64,
        load: L,
        store: S,
    ) -> io::Result<u64> {
        // load + store is OK without concurrent writers
        let old = load(atomic);

        let new = if change > 0 {
            old.checked_add(change as u64)
        } else {
            old.checked_sub(-change as u64)
        };
        let new = new.ok_or_else(|| {
            invalid_data(format!(
                "Changing refcount from {old} by {change} would overflow"
            ))
        })?;

        store(atomic, new).map_err(|_| {
            invalid_data(format!(
                "Changing refcount from {old} to {new} (by {change}) would overflow"
            ))
        })?;

        Ok(old)
    }

    /// Modify the given cluster’s refcount.
    fn modify(&mut self, index: usize, change: i64) -> io::Result<u64> {
        let result = match self.rb.refcount_order {
            // refcount_bits == 1, 2, 4
            0..=2 => {
                let (index, mask, shift) = self.rb.sub_byte_refcount_access(index);
                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u8>() };
                let atomic =
                    unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) };
                // Safe: `RefBlockWriteGuard` ensures there are no concurrent writers.
                unsafe { Self::fetch_update_bitset(atomic, change, mask, shift) }
            }

            // refcount_bits == 8
            3 => {
                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u8>() };
                let atomic =
                    unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) };
                // Safe: `RefBlockWriteGuard` ensures there are no concurrent writers.
                unsafe {
                    Self::fetch_update_full(
                        atomic,
                        change,
                        |a| a.load(Ordering::Relaxed) as u64,
                        |a, v| {
                            a.store(v.try_into()?, Ordering::Relaxed);
                            Ok(())
                        },
                    )
                }
            }

            // refcount_bits == 16
            4 => {
                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u16>() };
                let atomic = unsafe {
                    AtomicU16::from_ptr(&raw_data_slice[index] as *const u16 as *mut u16)
                };
                unsafe {
                    Self::fetch_update_full(
                        atomic,
                        change,
                        |a| u16::from_be(a.load(Ordering::Relaxed)) as u64,
                        |a, v| {
                            a.store(u16::try_from(v)?.to_be(), Ordering::Relaxed);
                            Ok(())
                        },
                    )
                }
            }

            // refcount_bits == 32
            5 => {
                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u32>() };
                let atomic = unsafe {
                    AtomicU32::from_ptr(&raw_data_slice[index] as *const u32 as *mut u32)
                };
                unsafe {
                    Self::fetch_update_full(
                        atomic,
                        change,
                        |a| u32::from_be(a.load(Ordering::Relaxed)) as u64,
                        |a, v| {
                            a.store(u32::try_from(v)?.to_be(), Ordering::Relaxed);
                            Ok(())
                        },
                    )
                }
            }

            // refcount_bits == 64
            6 => {
                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u64>() };
                let atomic = unsafe {
                    AtomicU64::from_ptr(&raw_data_slice[index] as *const u64 as *mut u64)
                };
                unsafe {
                    Self::fetch_update_full(
                        atomic,
                        change,
                        |a| u64::from_be(a.load(Ordering::Relaxed)),
                        |a, v| {
                            a.store(v.to_be(), Ordering::Relaxed);
                            Ok(())
                        },
                    )
                }
            }

            _ => unreachable!(),
        };

        let result = result?;
        self.rb.modified.store(true, Ordering::Relaxed);
        Ok(result)
    }

    /// Increment the given cluster’s refcount.
    ///
    /// Returns the old value.
    pub fn increment(&mut self, index: usize) -> io::Result<u64> {
        self.modify(index, 1)
    }

    /// Decrement the given cluster’s refcount.
    ///
    /// Returns the old value.
    pub fn decrement(&mut self, index: usize) -> io::Result<u64> {
        self.modify(index, -1)
    }

    /// Check whether the given cluster’s refcount is 0.
    pub fn is_zero(&self, index: usize) -> bool {
        self.rb.is_zero(index)
    }
}

impl Drop for RefBlock {
    fn drop(&mut self) {
        if self.is_modified() {
            error!(
                "Refcount block dropped while modified; was the image closed before being flushed?"
            );
        }
    }
}

/// Generic trait for qcow2 table entries (L1, L2, refcount table).
pub trait TableEntry
where
    Self: Sized,
{
    /// Load the given raw value, checking it for validity.
    fn try_from_plain(value: u64, header: &Header) -> io::Result<Self>;

    /// Return the contained raw value.
    fn to_plain(&self) -> u64;
}

/// Generic trait for qcow2 metadata tables (L1, L2, refcount table).
pub trait Table: Sized {
    /// Internal type for each table entry.
    type InternalEntry: TableEntry;
    /// Externally visible type for each table entry.
    type Entry: Copy;
    /// User-readable struct name.
    const NAME: &'static str;
    /// Maximum allowable number of entries.
    const MAX_ENTRIES: usize;

    /// Create a new table with the given contents.
    fn from_data(data: Box<[Self::InternalEntry]>, header: &Header) -> Self;

    /// Number of entries.
    fn entries(&self) -> usize;
    /// Get the given entry (as reference).
    fn get_ref(&self, index: usize) -> Option<&Self::InternalEntry>;
    /// Get the given entry (copied).
    fn get(&self, index: usize) -> Self::Entry;
    /// Get this table’s (first) cluster in the image file.
    fn get_cluster(&self) -> Option<HostCluster>;
    /// Get this table’s offset in the image file.
    fn get_offset(&self) -> Option<HostOffset>;
    /// Set this table’s (first) cluster in the image file (for writing).
    fn set_cluster(&mut self, cluster: HostCluster);
    /// Remove the table’s association with any cluster in the image file.
    fn unset_cluster(&mut self);

    /// Return log2 of the cluster size.
    ///
    /// All tables store this anyway.
    fn cluster_bits(&self) -> u32;

    /// Check whether this table has been modified since it was last written.
    fn is_modified(&self) -> bool;
    /// Clear the modified flag.
    fn clear_modified(&self);
    /// Set the modified flag.
    fn set_modified(&self);

    /// Table size in bytes.
    fn byte_size(&self) -> usize {
        self.entries() * size_of::<u64>()
    }

    /// Number of clusters used by this table.
    fn cluster_count(&self) -> ClusterCount {
        ClusterCount::from_byte_size(self.byte_size() as u64, self.cluster_bits())
    }

    /// Load a table from the image file.
    async fn load<S: Storage>(
        image: &S,
        header: &Header,
        cluster: HostCluster,
        entries: usize,
    ) -> io::Result<Self> {
        let offset = cluster.offset(header.cluster_bits());

        check_table(
            Self::NAME,
            offset.0,
            entries,
            size_of::<u64>(),
            Self::MAX_ENTRIES,
            header.cluster_size(),
        )?;

        let byte_size = entries * size_of::<u64>();
        let mut buffer = IoBuffer::new(byte_size, cmp::max(image.mem_align(), size_of::<u64>()))?;

        image.read(&mut buffer, offset.0).await?;

        // Safe because `u64` is a plain type, and the alignment fits
        let raw_table = unsafe { buffer.as_ref().into_typed_slice::<u64>() };

        let mut table = Vec::<Self::InternalEntry>::with_capacity(entries);
        for be_value in raw_table {
            table.push(Self::InternalEntry::try_from_plain(
                u64::from_be(*be_value),
                header,
            )?)
        }

        let mut table = Self::from_data(table.into_boxed_slice(), header);
        table.set_cluster(cluster);
        table.clear_modified();
        Ok(table)
    }

    /// Write a table to the image file.
    ///
    /// Callers must ensure the table is copied, i.e. its refcount is 1.
    async fn write<S: Storage>(&self, image: &S) -> io::Result<()> {
        let offset = self
            .get_offset()
            .ok_or_else(|| io::Error::other("Cannot write qcow2 metadata table, no offset set"))?;

        check_table(
            Self::NAME,
            offset.0,
            self.entries(),
            size_of::<u64>(),
            Self::MAX_ENTRIES,
            1 << self.cluster_bits(),
        )?;

        let byte_size = self.byte_size();
        let mut buffer = IoBuffer::new(byte_size, cmp::max(image.mem_align(), size_of::<u64>()))?;

        self.clear_modified();

        // Safe because we have just allocated this, and it fits the alignment
        let raw_table = unsafe { buffer.as_mut().into_typed_slice::<u64>() };
        for (i, be_value) in raw_table.iter_mut().enumerate() {
            // 0 always works, that’s by design.
            *be_value = self.get_ref(i).map(|e| e.to_plain()).unwrap_or(0).to_be();
        }

        if let Err(err) = image.write(&buffer, offset.0).await {
            self.set_modified();
            return Err(err);
        }

        Ok(())
    }

    /// Write at least the given single (modified) entry to the image file.
    ///
    /// Potentially writes more of the table, if alignment requirements ask for that.
    async fn write_entry<S: Storage>(&self, image: &S, index: usize) -> io::Result<()> {
        // This alignment calculation code implicitly assumes that the cluster size is aligned to
        // the storage’s request/memory alignment, which is usually a fair assumption.  If it is
        // not, there is not much we can do anyway.
        let byte_size = self.byte_size();
        let power_of_two_up_to_byte_size = ((byte_size / 2) + 1).next_power_of_two();
        let alignment = cmp::min(
            power_of_two_up_to_byte_size,
            cmp::max(
                cmp::max(image.mem_align(), image.req_align()),
                size_of::<u64>(),
            ),
        );
        let alignment_in_entries = alignment / size_of::<u64>();
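        // E.g. (hypothetical): a 64 KiB table on storage with 4 KiB memory/request
        // alignment yields `alignment` = 4096, i.e. each entry update writes 512 entries.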

        let offset = self
            .get_offset()
            .ok_or_else(|| io::Error::other("Cannot write qcow2 metadata table, no offset set"))?;

        check_table(
            Self::NAME,
            offset.0,
            self.entries(),
            size_of::<u64>(),
            Self::MAX_ENTRIES,
            1 << self.cluster_bits(),
        )?;

        let mut buffer = IoBuffer::new(alignment, cmp::max(image.mem_align(), size_of::<u64>()))?;

        // Safe because we have just allocated this, and it fits the alignment
        let raw_entries = unsafe { buffer.as_mut().into_typed_slice::<u64>() };
        let first_index = (index / alignment_in_entries) * alignment_in_entries;
        #[allow(clippy::needless_range_loop)]
        for i in 0..alignment_in_entries {
            // 0 always works, that’s by design.
            raw_entries[i] = self
                .get_ref(first_index + i)
                .map(|e| e.to_plain())
                .unwrap_or(0)
                .to_be();
        }

        image
            .write(&buffer, offset.0 + (first_index * size_of::<u64>()) as u64)
            .await
    }
}

/// Check whether the given table offset/size is valid.
///
/// Also works for refcount blocks (with cheating, because their entry size can be less than a
/// byte), which is why it is outside of [`Table`].
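///
/// A minimal sketch of a passing call (hypothetical values: a 512-entry table of 8-byte entries
/// at offset 0x1_0000 in an image with 64 KiB clusters):
///
/// ```
/// check_table("L1 table", 0x1_0000, 512, 8, 4 * 1024 * 1024, 65536).unwrap();
/// ```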
fn check_table(
    name: &str,
    offset: u64,
    entries: usize,
    entry_size: usize,
    max_entries: usize,
    cluster_size: usize,
) -> io::Result<()> {
    if entries > max_entries {
        return Err(invalid_data(format!(
            "{name} too big: {entries} > {max_entries}",
        )));
    }

    if !offset.is_multiple_of(cluster_size as u64) {
        return Err(invalid_data(format!("{name}: Unaligned offset: {offset}")));
    }

    let byte_size = entries
        .checked_mul(entry_size)
        .ok_or_else(|| invalid_data(format!("{name} size overflow: {entries} * {entry_size}")))?;
    let end_offset = offset
        .checked_add(byte_size as u64)
        .ok_or_else(|| invalid_data(format!("{name} offset overflow: {offset} + {byte_size}")))?;
    if end_offset > MAX_FILE_LENGTH {
        return Err(invalid_data(format!(
            "{name}: Invalid end offset: {end_offset} > {MAX_FILE_LENGTH}"
        )));
    }

    Ok(())
}

/// Helper function replacing `bincode::serialized_size()`.
///
/// That function has not been implemented in bincode 2 yet.
fn encoded_size<E: Encode>(val: E) -> io::Result<usize> {
    let mut length = bincode::enc::write::SizeWriter::default();
    bincode::encode_into_writer(val, &mut length, BINCODE_CFG)
        .map_err(|err| invalid_data(err.to_string()))?;
    Ok(length.bytes_written)
}

/// Helper function replacing `bincode::encode_to_vec()`.
///
/// Bincode provides an `encode_to_vec()` function, but only under the `alloc` feature.  For some
/// reason, enabling that feature pulls in a serde dependency, so we re-implement it here.
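///
/// A quick sketch of the expected behavior (big-endian fixed-int encoding, so a `u32` encodes to
/// exactly four bytes):
///
/// ```
/// assert_eq!(encode_binary(&0x0102_0304u32).unwrap(), [1, 2, 3, 4]);
/// ```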
fn encode_binary<E: Encode>(val: &E) -> io::Result<Vec<u8>> {
    let mut vec = vec![0; encoded_size(val)?];
    bincode::encode_into_slice(val, &mut vec, BINCODE_CFG)
        .map_err(|err| invalid_data(err.to_string()))?;
    Ok(vec)
}

/// Helper function wrapping `bincode::decode_from_slice()`.
///
/// We already have [`encode_binary()`] as a helper, so we might as well have one for decoding.
fn decode_binary<D: Decode<()>>(slice: &[u8]) -> io::Result<D> {
    bincode::decode_from_slice(slice, BINCODE_CFG)
        .map(|(result, _)| result)
        .map_err(|err| invalid_data(err.to_string()))
}