imago/qcow2/metadata.rs

//! Functionality for working with qcow2 metadata.

use super::types::*;
use crate::io_buffers::IoBuffer;
use crate::macros::numerical_enum;
use crate::misc_helpers::invalid_data;
use crate::{Storage, StorageExt};
use bincode::Options;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::mem::size_of;
use std::num::TryFromIntError;
use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicU64, AtomicU8, Ordering};
use std::{cmp, io};
use tokio::sync::{Mutex, MutexGuard};
use tracing::error;

/// Qcow header magic ("QFI\xfb").
pub(super) const MAGIC: u32 = 0x51_46_49_fb;
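
// Note: 0x51, 0x46, 0x49, 0xfb are the big-endian bytes of the ASCII characters 'Q', 'F', 'I'
// followed by 0xfb, i.e. the "QFI\xfb" magic string mentioned above.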

/// Maximum file length.
const MAX_FILE_LENGTH: u64 = 0x0100_0000_0000_0000u64;

/// Maximum permissible host offset.
pub(super) const MAX_OFFSET: HostOffset = HostOffset(MAX_FILE_LENGTH - 512);

/// Minimum cluster size.
///
/// Defined by the specification.
pub(super) const MIN_CLUSTER_SIZE: usize = 512;

/// Maximum cluster size.
///
/// This is QEMU’s limit, so we can apply it, too.
pub(super) const MAX_CLUSTER_SIZE: usize = 2 * 1024 * 1024;

/// Minimum number of bits per refcount entry.
pub(super) const MIN_REFCOUNT_WIDTH: usize = 1;

/// Maximum number of bits per refcount entry.
pub(super) const MAX_REFCOUNT_WIDTH: usize = 64;
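
// Illustrative mapping to the header’s `refcount_order` field (the width is `1 << refcount_order`):
// order 0 gives the minimum of 1 bit, order 4 gives the 16 bits that version 2 images always use,
// and order 6 gives the maximum of 64 bits per refcount entry.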

/// Qcow2 v2 header.
#[derive(Deserialize, Serialize)]
struct V2Header {
    /// Qcow magic string ("QFI\xfb").
    magic: u32,

    /// Version number (valid values are 2 and 3).
    version: u32,

    /// Offset into the image file at which the backing file name is stored (NB: The string is not
    /// null terminated).  0 if the image doesn’t have a backing file.
    ///
    /// Note: backing files are incompatible with raw external data files (auto-clear feature bit
    /// 1).
    backing_file_offset: u64,

    /// Length of the backing file name in bytes.  Must not be longer than 1023 bytes.  Undefined
    /// if the image doesn’t have a backing file.
    backing_file_size: u32,

    /// Number of bits that are used for addressing an offset within a cluster (`1 << cluster_bits`
    /// is the cluster size).  Must not be less than 9 (i.e. 512 byte clusters).
    ///
    /// Note: qemu as of today has an implementation limit of 2 MB as the maximum cluster size and
    /// won’t be able to open images with larger cluster sizes.
    ///
    /// Note: if the image has Extended L2 Entries then `cluster_bits` must be at least 14 (i.e.
    /// 16384 byte clusters).
    cluster_bits: u32,

    /// Virtual disk size in bytes.
    ///
    /// Note: qemu has an implementation limit of 32 MB as the maximum L1 table size.  With a 2 MB
    /// cluster size, it is unable to populate a virtual cluster beyond 2 EB (61 bits); with a 512
    /// byte cluster size, it is unable to populate a virtual size larger than 128 GB (37 bits).
    /// Meanwhile, L1/L2 table layouts limit an image to no more than 64 PB (56 bits) of populated
    /// clusters, and an image may hit other limits first (such as a file system’s maximum size).
    size: AtomicU64,

    /// Encryption method:
    ///
    /// 0. no encryption
    /// 1. AES encryption
    /// 2. LUKS encryption
    crypt_method: u32,

    /// Number of entries in the active L1 table.
    l1_size: AtomicU32,

    /// Offset into the image file at which the active L1 table starts.  Must be aligned to a
    /// cluster boundary.
    l1_table_offset: AtomicU64,

    /// Offset into the image file at which the refcount table starts.  Must be aligned to a
    /// cluster boundary.
    refcount_table_offset: AtomicU64,

    /// Number of clusters that the refcount table occupies.
    refcount_table_clusters: AtomicU32,

    /// Number of snapshots contained in the image.
    nb_snapshots: u32,

    /// Offset into the image file at which the snapshot table starts.  Must be aligned to a
    /// cluster boundary.
    snapshots_offset: u64,
}

impl V2Header {
    /// Raw v2 header length.
    const RAW_SIZE: usize = 72;
}
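
// Worked check of the 72-byte figure (simple arithmetic, not additional spec text): the v2 fields
// above are five u64 fields (backing file offset, size, L1 table offset, refcount table offset,
// snapshot table offset) and eight u32 fields, i.e. 5 * 8 + 8 * 4 = 72 bytes when serialized with
// fixed-width big-endian encoding.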

/// Qcow2 v3 header.
#[derive(Deserialize, Serialize)]
struct V3HeaderBase {
    /// Bitmask of incompatible features.  An implementation must fail to open an image if an
    /// unknown bit is set.
    ///
    /// 0. Dirty bit.  If this bit is set then refcounts may be inconsistent, make sure to scan
    ///    L1/L2 tables to repair refcounts before accessing the image.
    /// 1. Corrupt bit.  If this bit is set then any data structure may be corrupt and the image
    ///    must not be written to (unless for regaining consistency).
    /// 2. External data file bit.  If this bit is set, an external data file is used.  Guest
    ///    clusters are then stored in the external data file.  For such images, clusters in the
    ///    external data file are not refcounted.  The offset field in the Standard Cluster
    ///    Descriptor must match the guest offset and neither compressed clusters nor internal
    ///    snapshots are supported.  An External Data File Name header extension may be present if
    ///    this bit is set.
    /// 3. Compression type bit.  If this bit is set, a non-default compression is used for
    ///    compressed clusters.  The compression_type field must be present and not zero.
    /// 4. Extended L2 Entries.  If this bit is set then L2 table entries use an extended format
    ///    that allows subcluster-based allocation.  See the Extended L2 Entries section for more
    ///    details.
    ///
    /// Bits 5-63 are reserved (set to 0).
    incompatible_features: u64,

    /// Bitmask of compatible features.  An implementation can safely ignore any unknown bits that
    /// are set.
    ///
    /// 0. Lazy refcounts bit.  If this bit is set then lazy refcount updates can be used.  This
    ///    means marking the image file dirty and postponing refcount metadata updates.
    ///
    /// Bits 1-63 are reserved (set to 0).
    compatible_features: u64,

    /// Bitmask of auto-clear features.  An implementation may only write to an image with unknown
    /// auto-clear features if it clears the respective bits from this field first.
    ///
    /// 0. Bitmaps extension bit.  This bit indicates consistency for the bitmaps extension data.
    ///    It is an error if this bit is set without the bitmaps extension present.  If the bitmaps
    ///    extension is present but this bit is unset, the bitmaps extension data must be
    ///    considered inconsistent.
    /// 1. Raw external data bit.  If this bit is set, the external data file can be read as a
    ///    consistent standalone raw image without looking at the qcow2 metadata.  Setting this bit
    ///    has a performance impact for some operations on the image (e.g. writing zeros requires
    ///    writing to the data file instead of only setting the zero flag in the L2 table entry)
    ///    and conflicts with backing files.  This bit may only be set if the External Data File
    ///    bit (incompatible feature bit 2) is also set.
    ///
    /// Bits 2-63 are reserved (set to 0).
    autoclear_features: u64,

    /// Describes the width of a reference count block entry (width in bits: `refcount_bits = 1 <<
    /// refcount_order`).  For version 2 images, the order is always assumed to be 4 (i.e.
    /// `refcount_bits = 16`).  This value may not exceed 6 (i.e. `refcount_bits = 64`).
    refcount_order: u32,

    /// Length of the header structure in bytes.  For version 2 images, the length is always
    /// assumed to be 72 bytes.  For version 3 it’s at least 104 bytes and must be a multiple of 8.
    header_length: u32,
}

impl V3HeaderBase {
    /// Raw v3 header length beyond the v2 header.
    const RAW_SIZE: usize = 104 - V2Header::RAW_SIZE;
}
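
// Worked check of the 104-byte minimum (arithmetic only): the base v3 additions are three u64
// feature bitmasks plus two u32 fields, 3 * 8 + 2 * 4 = 32 bytes, which together with the 72-byte
// v2 part gives the 104 bytes required of a version 3 header.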

impl Default for V3HeaderBase {
    fn default() -> Self {
        V3HeaderBase {
            incompatible_features: 0,
            compatible_features: 0,
            autoclear_features: 0,
            refcount_order: 4,
            header_length: (V2Header::RAW_SIZE + V3HeaderBase::RAW_SIZE) as u32,
        }
    }
}

numerical_enum! {
    /// Incompatible feature bits.
    pub(super) enum IncompatibleFeatures as u64 {
        Dirty = 1 << 0,
        Corrupt = 1 << 1,
        ExternalDataFile = 1 << 2,
        CompressionType = 1 << 3,
        ExtendedL2Entries = 1 << 4,
    }
}

impl From<IncompatibleFeatures> for (FeatureType, u8) {
    /// Get this feature’s feature name table key.
    fn from(feat: IncompatibleFeatures) -> (FeatureType, u8) {
        assert!((feat as u64).is_power_of_two());
        (
            FeatureType::Incompatible,
            (feat as u64).trailing_zeros() as u8,
        )
    }
}
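
// For instance (illustrative only): `IncompatibleFeatures::ExternalDataFile` is `1 << 2`, so the
// conversion above yields the key `(FeatureType::Incompatible, 2)`, matching how the feature name
// table indexes features by type and bit number.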

numerical_enum! {
    /// Compatible feature bits.
    pub(super) enum CompatibleFeatures as u64 {
        LazyRefcounts = 1 << 0,
    }
}

impl From<CompatibleFeatures> for (FeatureType, u8) {
    /// Get this feature’s feature name table key.
    fn from(feat: CompatibleFeatures) -> (FeatureType, u8) {
        assert!((feat as u64).is_power_of_two());
        (
            FeatureType::Compatible,
            (feat as u64).trailing_zeros() as u8,
        )
    }
}

numerical_enum! {
    /// Autoclear feature bits.
    pub(super) enum AutoclearFeatures as u64 {
        Bitmaps = 1 << 0,
        RawExternalData = 1 << 1,
    }
}

impl From<AutoclearFeatures> for (FeatureType, u8) {
    /// Get this feature’s feature name table key.
    fn from(feat: AutoclearFeatures) -> (FeatureType, u8) {
        assert!((feat as u64).is_power_of_two());
        (FeatureType::Autoclear, (feat as u64).trailing_zeros() as u8)
    }
}

numerical_enum! {
    /// Extension type IDs.
    pub(super) enum HeaderExtensionType as u32 {
        /// End of extension list.
        End = 0,

        /// Backing file format string.
        BackingFileFormat = 0xe2792aca,

        /// Map of feature bits to human-readable names.
        FeatureNameTable = 0x6803f857,

        /// External data file filename string.
        ExternalDataFileName = 0x44415441,
    }
}

/// Header for a header extension.
#[derive(Default, Deserialize, Serialize)]
struct HeaderExtensionHeader {
    /// Type code of the header extension.
    extension_type: u32,

    /// Data length.
    length: u32,
}

impl HeaderExtensionHeader {
    /// Raw struct length.
    const RAW_SIZE: usize = 8;
}

numerical_enum! {
    /// Feature type ID for the feature name table.
    #[derive(Hash)]
    pub(super) enum FeatureType as u8 {
        Incompatible = 0,
        Compatible = 1,
        Autoclear = 2,
    }
}

/// Header extensions (high-level representation).
#[derive(Debug, Clone, Eq, PartialEq)]
pub(super) enum HeaderExtension {
    /// Backing file format string.
    BackingFileFormat(String),

    /// Map of feature bits to human-readable names.
    FeatureNameTable(HashMap<(FeatureType, u8), String>),

    /// External data file filename string.
    ExternalDataFileName(String),

    /// Unknown extension.
    Unknown {
        /// Type.
        extension_type: u32,
        /// Data (as read).
        data: Vec<u8>,
    },
}

/// Integrated header representation.
pub(super) struct Header {
    /// v2 part of the header.
    v2: V2Header,

    /// Base v3 part of the header.
    v3: V3HeaderBase,

    /// Unrecognized header fields.
    unknown_header_fields: Vec<u8>,

    /// Backing filename string.
    backing_filename: Option<String>,

    /// Extensions.
    extensions: Vec<HeaderExtension>,

    /// Whether an external data file is required.
    external_data_file: bool,
}

impl Header {
    /// Load the qcow2 header from disk.
    ///
    /// If `writable` is false, do not perform any modifications (e.g. clearing auto-clear bits).
    pub async fn load<S: Storage>(image: &S, writable: bool) -> io::Result<Self> {
        // TODO: More sanity checks.
        let bincode = bincode::DefaultOptions::new()
            .with_fixint_encoding()
            .with_big_endian();

        let mut header_buf = vec![0u8; V2Header::RAW_SIZE];
        image.read(header_buf.as_mut_slice(), 0).await?;

        let header: V2Header = bincode.deserialize(&header_buf).map_err(invalid_data)?;
        if header.magic != MAGIC {
            return Err(invalid_data("Not a qcow2 file"));
        }

        let v3header_base = if header.version == 2 {
            V3HeaderBase::default()
        } else if header.version == 3 {
            let mut header_buf = vec![0u8; V3HeaderBase::RAW_SIZE];
            image
                .read(header_buf.as_mut_slice(), V2Header::RAW_SIZE as u64)
                .await?;
            bincode.deserialize(&header_buf).map_err(invalid_data)?
        } else {
            return Err(invalid_data(format!(
                "qcow2 v{} is not supported",
                header.version
            )));
        };

        let cluster_size = 1usize.checked_shl(header.cluster_bits).ok_or_else(|| {
            invalid_data(format!("Invalid cluster size: 2^{}", header.cluster_bits))
        })?;
        if !(MIN_CLUSTER_SIZE..=MAX_CLUSTER_SIZE).contains(&cluster_size) {
            return Err(invalid_data(format!(
                "Invalid cluster size: {cluster_size}; must be between {MIN_CLUSTER_SIZE} and {MAX_CLUSTER_SIZE}",
            )));
        }

        let min_header_size = V2Header::RAW_SIZE + V3HeaderBase::RAW_SIZE;
        if (v3header_base.header_length as usize) < min_header_size {
            return Err(invalid_data(format!(
                "qcow2 header too short: {} < {min_header_size}",
                v3header_base.header_length,
            )));
        } else if (v3header_base.header_length as usize) > cluster_size {
            return Err(invalid_data(format!(
                "qcow2 header too big: {} > {cluster_size}",
                v3header_base.header_length,
            )));
        }

        let unknown_header_fields = if header.version == 2 {
            Vec::new()
        } else {
            let mut unknown_header_fields =
                vec![0u8; v3header_base.header_length as usize - min_header_size];
            image
                .read(&mut unknown_header_fields, min_header_size as u64)
                .await?;
            unknown_header_fields
        };

        let l1_offset = HostOffset(header.l1_table_offset.load(Ordering::Relaxed));
        l1_offset
            .checked_cluster(header.cluster_bits)
            .ok_or_else(|| invalid_data(format!("Unaligned L1 table: {l1_offset}")))?;

        let rt_offset = HostOffset(header.refcount_table_offset.load(Ordering::Relaxed));
        rt_offset
            .checked_cluster(header.cluster_bits)
            .ok_or_else(|| invalid_data(format!("Unaligned refcount table: {rt_offset}")))?;

        let rc_width = 1usize
            .checked_shl(v3header_base.refcount_order)
            .ok_or_else(|| {
                invalid_data(format!(
                    "Invalid refcount width: 2^{}",
                    v3header_base.refcount_order
                ))
            })?;
        if !(MIN_REFCOUNT_WIDTH..=MAX_REFCOUNT_WIDTH).contains(&rc_width) {
            return Err(invalid_data(format!(
                "Invalid refcount width: {rc_width}; must be between {MIN_REFCOUNT_WIDTH} and {MAX_REFCOUNT_WIDTH}",
            )));
        }

        let backing_filename = if header.backing_file_offset != 0 {
            let (offset, length) = (header.backing_file_offset, header.backing_file_size);
            if length > 1023 {
                return Err(invalid_data(format!(
                    "Backing file name is too long ({length}, must not exceed 1023)"
                )));
            }

            let end = offset.checked_add(length as u64).ok_or(invalid_data(
                "Backing file name offset is invalid (too high)",
            ))?;
            if end >= cluster_size as u64 {
                return Err(invalid_data(
                    "Backing file name offset is invalid (beyond first cluster)",
                ));
            }

            let mut backing_buf = vec![0; length as usize];
            image.read(&mut backing_buf, offset).await?;

            Some(
                String::from_utf8(backing_buf)
                    .map_err(|err| invalid_data(format!("Backing file name is invalid: {err}")))?,
            )
        } else {
            None
        };

        let extensions = if header.version == 2 {
            Vec::new()
        } else {
            let mut ext_offset: u64 = v3header_base.header_length as u64;
            let mut extensions = Vec::<HeaderExtension>::new();
            loop {
                if ext_offset + HeaderExtensionHeader::RAW_SIZE as u64 > cluster_size as u64 {
                    return Err(invalid_data("Header extensions exceed the first cluster"));
                }

                let mut ext_hdr_buf = vec![0; HeaderExtensionHeader::RAW_SIZE];
                image.read(&mut ext_hdr_buf, ext_offset).await?;

                ext_offset += HeaderExtensionHeader::RAW_SIZE as u64;

                let ext_hdr: HeaderExtensionHeader =
                    bincode.deserialize(&ext_hdr_buf).map_err(invalid_data)?;
                let ext_end = ext_offset
                    .checked_add(ext_hdr.length as u64)
                    .ok_or_else(|| invalid_data("Header size overflow"))?;
                if ext_end > cluster_size as u64 {
                    return Err(invalid_data("Header extensions exceed the first cluster"));
                }

                let mut ext_data = vec![0; ext_hdr.length as usize];
                image.read(&mut ext_data, ext_offset).await?;

                ext_offset += (ext_hdr.length as u64).next_multiple_of(8);

                let Some(extension) =
                    HeaderExtension::deserialize(ext_hdr.extension_type, ext_data)?
                else {
                    break;
                };

                extensions.push(extension);
            }
            extensions
        };

        // Check for header extension conflicts
        let backing_fmt = extensions
            .iter()
            .find(|ext| matches!(ext, HeaderExtension::BackingFileFormat(_)));
        if let Some(backing_fmt) = backing_fmt {
            let conflicting = extensions.iter().find(|ext| {
                matches!(ext, HeaderExtension::BackingFileFormat(_)) && ext != &backing_fmt
            });
            if let Some(conflicting) = conflicting {
                return Err(io::Error::other(format!(
                    "Found conflicting backing file formats: {backing_fmt:?} != {conflicting:?}",
                )));
            }
        }
        let ext_data_file = extensions
            .iter()
            .find(|ext| matches!(ext, HeaderExtension::ExternalDataFileName(_)));
        if let Some(ext_data_file) = ext_data_file {
            let conflicting = extensions.iter().find(|ext| {
                matches!(ext, HeaderExtension::ExternalDataFileName(_)) && ext != &ext_data_file
            });
            if let Some(conflicting) = conflicting {
                return Err(io::Error::other(format!(
                    "Found conflicting external data file names: {ext_data_file:?} != {conflicting:?}",
                )));
            }
        }

        let mut incompatible_features = v3header_base.incompatible_features;
        let autoclear_features = v3header_base.autoclear_features;

        let external_data_file =
            incompatible_features & IncompatibleFeatures::ExternalDataFile as u64 != 0;
        incompatible_features &= !(IncompatibleFeatures::ExternalDataFile as u64);

        let mut header = Header {
            v2: header,
            v3: v3header_base,
            unknown_header_fields,
            backing_filename,
            extensions,
            external_data_file,
        };

        // No need to clear autoclear features for read-only images
        if autoclear_features != 0 && writable {
            header.v3.autoclear_features = 0;
            header.write(image).await?;
        }

        if incompatible_features != 0 {
            let feats = (0..64)
                .filter(|bit| header.v3.incompatible_features & (1u64 << bit) != 0)
                .map(|bit| {
                    if let Some(name) = header.feature_name(FeatureType::Incompatible, bit) {
                        format!("{bit} ({name})")
                    } else {
                        format!("{bit}")
                    }
                })
                .collect::<Vec<String>>();

            return Err(invalid_data(format!(
                "Unrecognized incompatible feature(s) {}",
                feats.join(", ")
            )));
        }

        Ok(header)
    }

    /// Write the qcow2 header to disk.
    pub async fn write<S: Storage>(&mut self, image: &S) -> io::Result<()> {
        let bincode = bincode::DefaultOptions::new()
            .with_fixint_encoding()
            .with_big_endian();

        let header_len = if self.v2.version > 2 {
            let len = bincode.serialized_size(&self.v2).unwrap() as usize
                + bincode.serialized_size(&self.v3).unwrap() as usize
                + self.unknown_header_fields.len();
            let len = len.next_multiple_of(8);
            self.v3.header_length = len as u32;
            len
        } else {
            V2Header::RAW_SIZE
        };

        // If the header gets too long, try to remove the feature name table to make it small
        // enough
        let mut header_exts;
        let mut backing_file_ofs;
        loop {
            header_exts = self.serialize_extensions()?;

            backing_file_ofs = header_len
                .checked_add(header_exts.len())
                .ok_or_else(|| invalid_data("Header size overflow"))?;
            let backing_file_len = self
                .backing_filename
                .as_ref()
                .map(|n| n.len()) // length in bytes
                .unwrap_or(0);
            let header_end = backing_file_ofs
                .checked_add(backing_file_len)
                .ok_or_else(|| invalid_data("Header size overflow"))?;

            if header_end <= self.cluster_size() {
                break;
            }

            if !self
                .extensions
                .iter()
                .any(|e| e.extension_type() == HeaderExtensionType::FeatureNameTable as u32)
            {
                return Err(io::Error::other(format!(
                    "Header would be too long ({header_end} > {})",
                    self.cluster_size()
                )));
            }
            self.extensions
                .retain(|e| e.extension_type() != HeaderExtensionType::FeatureNameTable as u32);
        }

        if let Some(backing) = self.backing_filename.as_ref() {
            self.v2.backing_file_offset = backing_file_ofs as u64;
            self.v2.backing_file_size = backing.len() as u32; // length in bytes
        } else {
            self.v2.backing_file_offset = 0;
            self.v2.backing_file_size = 0;
        };

        let mut full_buf = bincode.serialize(&self.v2).map_err(invalid_data)?;
        if self.v2.version > 2 {
            full_buf.append(&mut bincode.serialize(&self.v3).map_err(invalid_data)?);
            full_buf.extend_from_slice(&self.unknown_header_fields);
            full_buf.resize(full_buf.len().next_multiple_of(8), 0);
        }

        full_buf.append(&mut header_exts);

        if let Some(backing) = self.backing_filename.as_ref() {
            full_buf.extend_from_slice(backing.as_bytes());
        }

        if full_buf.len() > self.cluster_size() {
            return Err(io::Error::other(format!(
                "Header is too big to write ({}, larger than a cluster ({}))",
                full_buf.len(),
                self.cluster_size(),
            )));
        }

        image.write(&full_buf, 0).await
    }

    /// Create a header for a new image.
    pub fn new(
        cluster_bits: u32,
        refcount_order: u32,
        backing_filename: Option<String>,
        backing_format: Option<String>,
        external_data_file: Option<String>,
    ) -> Self {
        assert!((MIN_CLUSTER_SIZE..=MAX_CLUSTER_SIZE)
            .contains(&1usize.checked_shl(cluster_bits).unwrap()));
        assert!((MIN_REFCOUNT_WIDTH..=MAX_REFCOUNT_WIDTH)
            .contains(&1usize.checked_shl(refcount_order).unwrap()));

        let has_external_data_file = external_data_file.is_some();
        let incompatible_features = if has_external_data_file {
            IncompatibleFeatures::ExternalDataFile as u64
        } else {
            0
        };

        let mut extensions = vec![HeaderExtension::feature_name_table()];
        if let Some(backing_format) = backing_format {
            extensions.push(HeaderExtension::BackingFileFormat(backing_format));
        }
        if let Some(external_data_file) = external_data_file {
            extensions.push(HeaderExtension::ExternalDataFileName(external_data_file));
        }

        Header {
            v2: V2Header {
                magic: MAGIC,
                version: 3,
                backing_file_offset: 0, // will be set by `Self::write()`
                backing_file_size: 0,   // will be set by `Self::write()`
                cluster_bits,
                size: 0.into(),
                crypt_method: 0,
                l1_size: 0.into(),
                l1_table_offset: 0.into(),
                refcount_table_offset: 0.into(),
                refcount_table_clusters: 0.into(),
                nb_snapshots: 0,
                snapshots_offset: 0,
            },
            v3: V3HeaderBase {
                incompatible_features,
                compatible_features: 0,
                autoclear_features: 0,
                refcount_order,
                header_length: 0, // will be set by `Self::write()`
            },
            unknown_header_fields: Vec::new(),
            backing_filename,
            extensions,
            external_data_file: has_external_data_file,
        }
    }
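
    // Illustrative use of the constructor above (a sketch, not invoked anywhere in this module):
    // a fresh v3 image with 64 KiB clusters and 16-bit refcounts, no backing file and no external
    // data file, would be described by `Header::new(16, 4, None, None, None)`; the offsets and the
    // header length are filled in later by `Self::write()`.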

    /// Update from a newly loaded header.
    ///
    /// Checks whether fields we consider immutable have remained the same, and updates mutable
    /// fields.
    pub fn update(&self, new_header: &Header) -> io::Result<()> {
        /// Verify that the given field matches in `self` and `new_header`.
        macro_rules! check_field {
            ($($field:ident).*) => {
                (self.$($field).* == new_header.$($field).*).then_some(()).ok_or_else(|| {
                    io::Error::other(format!(
                        "Incompatible header modification on {}: {} != {}",
                        stringify!($($field).*),
                        self.$($field).*,
                        new_header.$($field).*
                    ))
                })
            };
        }

        check_field!(v2.magic)?;
        check_field!(v2.version)?;
        check_field!(v2.backing_file_offset)?; // TODO: Should be mutable
        check_field!(v2.backing_file_size)?; // TODO: Should be mutable
        check_field!(v2.cluster_bits)?;
        // Size is mutable
        // L1 position is mutable
        // Reftable position is mutable
        check_field!(v2.crypt_method)?;
        check_field!(v2.nb_snapshots)?; // TODO: Should be mutable
        check_field!(v2.snapshots_offset)?; // TODO: Should be mutable
        check_field!(v3.incompatible_features)?; // TODO: Should be mutable
        check_field!(v3.compatible_features)?; // TODO: Should be mutable
        check_field!(v3.autoclear_features)?; // TODO: Should be mutable
        check_field!(v3.refcount_order)?;
        // header length is OK to ignore (as long as it’s valid)

        // TODO: Should be mutable
        (self.unknown_header_fields == new_header.unknown_header_fields)
            .then_some(())
            .ok_or_else(|| io::Error::other("Unknown header fields modified"))?;
        // TODO: Should be mutable
        (self.backing_filename == new_header.backing_filename)
            .then_some(())
            .ok_or_else(|| io::Error::other("Backing filename modified"))?;
        // TODO: Should be mutable
        (self.extensions == new_header.extensions)
            .then_some(())
            .ok_or_else(|| io::Error::other("Header extensions modified"))?;

        check_field!(external_data_file)?;

        self.v2.size.store(
            new_header.v2.size.load(Ordering::Relaxed),
            Ordering::Relaxed,
        );

        self.v2.l1_table_offset.store(
            new_header.v2.l1_table_offset.load(Ordering::Relaxed),
            Ordering::Relaxed,
        );
        self.v2.l1_size.store(
            new_header.v2.l1_size.load(Ordering::Relaxed),
            Ordering::Relaxed,
        );
        self.v2.refcount_table_offset.store(
            new_header.v2.refcount_table_offset.load(Ordering::Relaxed),
            Ordering::Relaxed,
        );
        self.v2.refcount_table_clusters.store(
            new_header
                .v2
                .refcount_table_clusters
                .load(Ordering::Relaxed),
            Ordering::Relaxed,
        );

        Ok(())
    }

    /// Guest disk size.
    pub fn size(&self) -> u64 {
        self.v2.size.load(Ordering::Relaxed)
    }

    /// Require a minimum qcow2 version.
    ///
    /// Return an error if the version requirement is not met.
    pub fn require_version(&self, minimum: u32) -> io::Result<()> {
        let version = self.v2.version;
        if version >= minimum {
            Ok(())
        } else {
            Err(io::Error::new(
                io::ErrorKind::Unsupported,
                format!("qcow2 version {minimum} required, image has version {version}"),
            ))
        }
    }

    /// Set the guest disk size.
    pub fn set_size(&self, new_size: u64) {
        self.v2.size.store(new_size, Ordering::Relaxed)
    }

    /// log2 of the cluster size.
    pub fn cluster_bits(&self) -> u32 {
        self.v2.cluster_bits
    }

    /// Cluster size in bytes.
    pub fn cluster_size(&self) -> usize {
        1 << self.cluster_bits()
    }
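
    // For example (illustrative only): `cluster_bits` = 16 gives 1 << 16 = 65536, i.e. 64 KiB
    // clusters (qemu’s default cluster size); the permitted range spans `cluster_bits` 9
    // (512 bytes) through 21 (2 MiB).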

    /// Number of entries per L2 table.
    pub fn l2_entries(&self) -> usize {
        // 3 == log2(size_of::<u64>())
        1 << (self.cluster_bits() - 3)
    }
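
    // Worked example (arithmetic only): with 64 KiB clusters, an L2 table holds 65536 / 8 = 8192
    // entries, so one fully-populated L2 table maps 8192 * 64 KiB = 512 MiB of guest data.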

    /// log2 of the number of entries per refcount block.
    pub fn rb_bits(&self) -> u32 {
        // log2((cluster_size * 8) >> refcount_order); written with the addition first so that it
        // cannot underflow for sub-byte refcount widths (refcount_order < 3)
        self.cluster_bits() + 3 - self.refcount_order()
    }

    /// Number of entries per refcount block.
    pub fn rb_entries(&self) -> usize {
        1 << self.rb_bits()
    }
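
    // Worked example (arithmetic only): with 64 KiB clusters and 16-bit refcounts
    // (refcount_order = 4), rb_bits() is 16 + 3 - 4 = 15, so each refcount block holds
    // 1 << 15 = 32768 refcounts.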

    /// log2 of the refcount bits.
    pub fn refcount_order(&self) -> u32 {
        self.v3.refcount_order
    }

    /// Offset of the L1 table.
    pub fn l1_table_offset(&self) -> HostOffset {
        HostOffset(self.v2.l1_table_offset.load(Ordering::Relaxed))
    }

    /// Number of entries in the L1 table.
    pub fn l1_table_entries(&self) -> usize {
        self.v2.l1_size.load(Ordering::Relaxed) as usize
    }

    /// Enter a new L1 table in the image header.
    pub fn set_l1_table(&self, l1_table: &L1Table) -> io::Result<()> {
        let offset = l1_table.get_offset().ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "New L1 table has no assigned offset",
            )
        })?;

        let entries = l1_table.entries();
        let entries = entries
            .try_into()
            .map_err(|err| invalid_data(format!("Too many L1 entries ({entries}): {err}")))?;

        self.v2.l1_table_offset.store(offset.0, Ordering::Relaxed);

        self.v2.l1_size.store(entries, Ordering::Relaxed);

        Ok(())
    }

    /// Offset of the refcount table.
    pub fn reftable_offset(&self) -> HostOffset {
        HostOffset(self.v2.refcount_table_offset.load(Ordering::Relaxed))
    }

    /// Number of clusters occupied by the refcount table.
    pub fn reftable_clusters(&self) -> ClusterCount {
        ClusterCount(self.v2.refcount_table_clusters.load(Ordering::Relaxed) as u64)
    }

    /// Number of entries in the refcount table.
    pub fn reftable_entries(&self) -> usize {
        // 3 == log2(size_of::<u64>())
        (self.reftable_clusters().byte_size(self.cluster_bits()) >> 3) as usize
    }
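
    // Worked example (arithmetic only): a single-cluster refcount table with 64 KiB clusters holds
    // 65536 / 8 = 8192 u64 entries, each pointing to one refcount block.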

    /// Enter a new refcount table in the image header.
    pub fn set_reftable(&self, reftable: &RefTable) -> io::Result<()> {
        let offset = reftable.get_offset().ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "New refcount table has no assigned offset",
            )
        })?;

        let clusters = reftable.cluster_count();
        let clusters = clusters.0.try_into().map_err(|err| {
            invalid_data(format!("Too many reftable clusters ({clusters}): {err}"))
        })?;

        self.v2
            .refcount_table_clusters
            .store(clusters, Ordering::Relaxed);

        self.v2
            .refcount_table_offset
            .store(offset.0, Ordering::Relaxed);

        Ok(())
    }

    /// Backing filename from the image header (if any).
    pub fn backing_filename(&self) -> Option<&String> {
        self.backing_filename.as_ref()
    }

    /// Backing format string from the image header (if any).
    pub fn backing_format(&self) -> Option<&String> {
        self.extensions.iter().find_map(|e| match e {
            HeaderExtension::BackingFileFormat(fmt) => Some(fmt),
            _ => None,
        })
    }

    /// Whether this image requires an external data file.
    pub fn external_data_file(&self) -> bool {
        self.external_data_file
    }

    /// External data file filename from the image header (if any).
    pub fn external_data_filename(&self) -> Option<&String> {
        self.extensions.iter().find_map(|e| match e {
            HeaderExtension::ExternalDataFileName(filename) => Some(filename),
            _ => None,
        })
    }

    /// Translate a feature bit to a human-readable name.
    ///
    /// Uses the feature name table from the image header, if present.
    pub fn feature_name(&self, feat_type: FeatureType, bit: u32) -> Option<&String> {
        for e in &self.extensions {
            if let HeaderExtension::FeatureNameTable(names) = e {
                if let Some(name) = names.get(&(feat_type, bit as u8)) {
                    return Some(name);
                }
            }
        }

        None
    }

    /// Serialize all header extensions.
    fn serialize_extensions(&self) -> io::Result<Vec<u8>> {
        let bincode = bincode::DefaultOptions::new()
            .with_fixint_encoding()
            .with_big_endian();

        let mut result = Vec::new();
        for e in &self.extensions {
            let mut data = e.serialize_data()?;
            let ext_hdr = HeaderExtensionHeader {
                extension_type: e.extension_type(),
                length: data.len().try_into().map_err(|err| {
                    invalid_data(format!("Header extension too long ({}): {err}", data.len()))
                })?,
            };
            result.append(&mut bincode.serialize(&ext_hdr).map_err(invalid_data)?);
            result.append(&mut data);
            result.resize(result.len().next_multiple_of(8), 0);
        }

        let end_ext = HeaderExtensionHeader {
            extension_type: HeaderExtensionType::End as u32,
            length: 0,
        };
        result.append(&mut bincode.serialize(&end_ext).map_err(invalid_data)?);
        result.resize(result.len().next_multiple_of(8), 0);

        Ok(result)
    }

    /// Helper for functions that just need to change little bits in the v2 header part.
    async fn write_v2_header<S: Storage>(&self, image: &S) -> io::Result<()> {
        let bincode = bincode::DefaultOptions::new()
            .with_fixint_encoding()
            .with_big_endian();

        let v2_header = bincode.serialize(&self.v2).map_err(invalid_data)?;
        image.write(&v2_header, 0).await
    }

    /// Write the refcount table pointer (offset and size) to disk.
    pub async fn write_reftable_pointer<S: Storage>(&self, image: &S) -> io::Result<()> {
        // TODO: Just write the reftable offset and size
        self.write_v2_header(image).await
    }

    /// Write the L1 table pointer (offset and size) to disk.
    pub async fn write_l1_table_pointer<S: Storage>(&self, image: &S) -> io::Result<()> {
        // TODO: Just write the L1 table offset and size
        self.write_v2_header(image).await
    }

    /// Write the guest disk size to disk.
    pub async fn write_size<S: Storage>(&self, image: &S) -> io::Result<()> {
        // TODO: Just write the size
        self.write_v2_header(image).await
    }
}

impl HeaderExtension {
    /// Parse an extension from its type and data.  Unrecognized types are stored as `Unknown`
    /// extensions; encountering the end of the extension list returns `Ok(None)`.
    fn deserialize(ext_type: u32, data: Vec<u8>) -> io::Result<Option<Self>> {
        let ext = if let Ok(ext_type) = HeaderExtensionType::try_from(ext_type) {
            match ext_type {
                HeaderExtensionType::End => return Ok(None),
                HeaderExtensionType::BackingFileFormat => {
                    let fmt = String::from_utf8(data).map_err(|err| {
                        invalid_data(format!("Invalid backing file format: {err}"))
                    })?;
                    HeaderExtension::BackingFileFormat(fmt)
                }
                HeaderExtensionType::FeatureNameTable => {
                    let mut feats = HashMap::new();
                    // Entries are 48 bytes each; `chunks_exact()` skips a trailing partial entry
                    // instead of panicking on the short slice below.
                    for feat in data.chunks_exact(48) {
                        let feat_type: FeatureType = match feat[0].try_into() {
                            Ok(ft) => ft,
                            Err(_) => continue, // skip unrecognized entries
                        };
                        // Cannot use CStr to parse this, as it may not be NUL-terminated.
                        // Use this to remove everything from the first NUL byte.
                        let feat_name_bytes = feat[2..].split(|c| *c == 0).next().unwrap();
                        // Then just use it as a UTF-8 string.
                        let feat_name = String::from_utf8_lossy(feat_name_bytes);
                        feats.insert((feat_type, feat[1]), feat_name.to_string());
                    }
                    HeaderExtension::FeatureNameTable(feats)
                }
                HeaderExtensionType::ExternalDataFileName => {
                    let filename = String::from_utf8(data).map_err(|err| {
                        invalid_data(format!("Invalid external data file name: {err}"))
                    })?;
                    HeaderExtension::ExternalDataFileName(filename)
                }
            }
        } else {
            HeaderExtension::Unknown {
                extension_type: ext_type,
                data,
            }
        };

        Ok(Some(ext))
    }

    /// Return the extension type ID.
    fn extension_type(&self) -> u32 {
        match self {
            HeaderExtension::BackingFileFormat(_) => HeaderExtensionType::BackingFileFormat as u32,
            HeaderExtension::FeatureNameTable(_) => HeaderExtensionType::FeatureNameTable as u32,
            HeaderExtension::ExternalDataFileName(_) => {
                HeaderExtensionType::ExternalDataFileName as u32
            }
            HeaderExtension::Unknown {
                extension_type,
                data: _,
            } => *extension_type,
        }
    }

    /// Serialize this extension’s data (excluding its header).
    fn serialize_data(&self) -> io::Result<Vec<u8>> {
        match self {
            HeaderExtension::BackingFileFormat(fmt) => Ok(fmt.as_bytes().into()),
            HeaderExtension::FeatureNameTable(map) => {
                let mut result = Vec::new();
                for (bit, name) in map {
                    result.push(bit.0 as u8);
                    result.push(bit.1);

                    let mut padded_name = vec![0; 46];
                    let name_bytes = name.as_bytes();
                    // Might truncate in the middle of a multibyte character, but getting that
                    // right is complicated and probably not worth it
                    let truncated_len = cmp::min(name_bytes.len(), 46);
                    padded_name[..truncated_len].copy_from_slice(&name_bytes[..truncated_len]);
                    result.extend_from_slice(&padded_name);
                }
                Ok(result)
            }
            HeaderExtension::ExternalDataFileName(filename) => Ok(filename.as_bytes().into()),
            HeaderExtension::Unknown {
                extension_type: _,
                data,
            } => Ok(data.clone()),
        }
    }

    /// Creates a [`Self::FeatureNameTable`].
    fn feature_name_table() -> Self {
        use {AutoclearFeatures as A, CompatibleFeatures as C, IncompatibleFeatures as I};

        let mut map = HashMap::new();

        map.insert(I::Dirty.into(), "dirty".into());
        map.insert(I::Corrupt.into(), "corrupt".into());
        map.insert(I::ExternalDataFile.into(), "external data file".into());
        map.insert(
            I::CompressionType.into(),
            "extended compression type".into(),
        );
        map.insert(I::ExtendedL2Entries.into(), "extended L2 entries".into());

        map.insert(C::LazyRefcounts.into(), "lazy refcounts".into());

        map.insert(A::Bitmaps.into(), "persistent dirty bitmaps".into());
        map.insert(A::RawExternalData.into(), "raw external data file".into());

        HeaderExtension::FeatureNameTable(map)
    }
}

/// L1 table entry.
///
/// - Bit 0 - 8: Reserved (set to 0)
/// - Bit 9 – 55: Bits 9-55 of the offset into the image file at which the L2 table starts.  Must
///   be aligned to a cluster boundary.  If the offset is 0, the L2 table and all clusters
///   described by this L2 table are unallocated.
/// - Bit 56 - 62: Reserved (set to 0)
/// - Bit 63: 0 for an L2 table that is unused or requires COW, 1 if its refcount is exactly one.
///   This information is only accurate in the active L1 table.
#[derive(Copy, Clone, Default, Debug)]
pub(super) struct L1Entry(u64);

impl L1Entry {
    /// Offset of the L2 table, if any.
    pub fn l2_offset(&self) -> Option<HostOffset> {
        let ofs = self.0 & 0x00ff_ffff_ffff_fe00u64;
        if ofs == 0 {
            None
        } else {
            Some(HostOffset(ofs))
        }
    }

    /// Whether the L2 table’s cluster is “copied”.
    ///
    /// `true` means its refcount is one, `false` means modifying it will require COW.
    pub fn is_copied(&self) -> bool {
        self.0 & (1u64 << 63) != 0
    }

    /// Return all reserved bits.
    pub fn reserved_bits(&self) -> u64 {
        self.0 & 0x7f00_0000_0000_01feu64
    }
}
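
// Illustrative reading of the masks above (this follows directly from the bit layout in the L1
// entry doc comment): 0x00ff_ffff_ffff_fe00 selects bits 9-55, the L2 table offset, while
// 0x7f00_0000_0000_01fe selects bits 1-8 and 56-62, i.e. reserved bits other than the COPIED flag
// in bit 63.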

impl TableEntry for L1Entry {
    fn try_from_plain(value: u64, header: &Header) -> io::Result<Self> {
        let entry = L1Entry(value);

        if entry.reserved_bits() != 0 {
            return Err(invalid_data(format!(
                "Invalid L1 entry 0x{value:x}, reserved bits set (0x{:x})",
                entry.reserved_bits(),
            )));
        }

        if let Some(l2_ofs) = entry.l2_offset() {
            if l2_ofs.in_cluster_offset(header.cluster_bits()) != 0 {
                return Err(invalid_data(format!(
                    "Invalid L1 entry 0x{value:x}, offset ({l2_ofs}) is not aligned to cluster size (0x{:x})",
                    header.cluster_size(),
                )));
            }
        }

        Ok(entry)
    }

    fn to_plain(&self) -> u64 {
        self.0
    }
}

/// L1 table.
#[derive(Debug)]
pub(super) struct L1Table {
    /// First cluster in the image file.
    cluster: Option<HostCluster>,

    /// Table data.
    data: Box<[L1Entry]>,

    /// log2 of the cluster size.
    cluster_bits: u32,

    /// Whether this table has been modified since it was last written.
    modified: AtomicBool,
}

impl L1Table {
    /// Create a clone that covers at least `at_least_index`.
    pub fn clone_and_grow(&self, at_least_index: usize, header: &Header) -> io::Result<Self> {
        let new_entry_count = cmp::max(at_least_index + 1, self.data.len());
        let new_entry_count =
            new_entry_count.next_multiple_of(header.cluster_size() / size_of::<L1Entry>());

        if new_entry_count > <Self as Table>::MAX_ENTRIES {
            return Err(io::Error::other(
                "Cannot grow the image to this size; L1 table would become too big",
            ));
        }

        let mut new_data = vec![L1Entry::default(); new_entry_count];
        new_data[..self.data.len()].copy_from_slice(&self.data);

        Ok(Self {
            cluster: None,
            data: new_data.into_boxed_slice(),
            cluster_bits: header.cluster_bits(),
            modified: true.into(),
        })
    }

    /// Check whether `index` is in bounds.
    pub fn in_bounds(&self, index: usize) -> bool {
        index < self.data.len()
    }

    /// Enter the given L2 table into this L1 table.
    pub fn enter_l2_table(&mut self, index: usize, l2: &L2Table) -> io::Result<()> {
        let l2_offset = l2.get_offset().ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "L2 table has no assigned offset",
            )
        })?;

        let l1entry = L1Entry((1 << 63) | l2_offset.0);
        debug_assert!(l1entry.reserved_bits() == 0);
        self.data[index] = l1entry;
        self.modified.store(true, Ordering::Relaxed);

        Ok(())
    }
}

impl Table for L1Table {
    type InternalEntry = L1Entry;
    type Entry = L1Entry;
    const NAME: &'static str = "L1 table";

    /// Maximum number of L1 table entries.
    ///
    /// Limit taken from QEMU; if QEMU rejects this, we can, too.
    const MAX_ENTRIES: usize = 4 * 1024 * 1024;

    fn from_data(data: Box<[L1Entry]>, header: &Header) -> Self {
        Self {
            cluster: None,
            data,
            cluster_bits: header.cluster_bits(),
            modified: true.into(),
        }
    }

    fn entries(&self) -> usize {
        self.data.len()
    }

    fn get_ref(&self, index: usize) -> Option<&L1Entry> {
        self.data.get(index)
    }

    fn get(&self, index: usize) -> L1Entry {
        self.data.get(index).copied().unwrap_or(L1Entry(0))
    }

    fn get_cluster(&self) -> Option<HostCluster> {
        self.cluster
    }

    fn get_offset(&self) -> Option<HostOffset> {
        self.cluster.map(|index| index.offset(self.cluster_bits))
    }

    fn set_cluster(&mut self, cluster: HostCluster) {
        self.cluster = Some(cluster);
        self.modified.store(true, Ordering::Relaxed);
    }

    fn unset_cluster(&mut self) {
        self.cluster = None;
    }

    fn is_modified(&self) -> bool {
        self.modified.load(Ordering::Relaxed)
    }

    fn clear_modified(&self) {
        self.modified.store(false, Ordering::Relaxed);
    }

    fn set_modified(&self) {
        self.modified.store(true, Ordering::Relaxed);
    }

    fn cluster_bits(&self) -> u32 {
        self.cluster_bits
    }
}

/// L2 table entry.
///
/// - Bit 0 - 61: Cluster descriptor
/// - Bit 62: 0 for standard clusters, 1 for compressed clusters
/// - Bit 63: 0 for clusters that are unused, compressed or require COW.  1 for standard clusters
///   whose refcount is exactly one.  This information is only accurate in L2 tables that are
///   reachable from the active L1 table.  With external data files, all guest clusters have an
///   implicit refcount of 1 (because of the fixed host = guest mapping for guest cluster offsets),
///   so this bit should be 1 for all allocated clusters.
///
/// Standard Cluster Descriptor:
/// - Bit 0: If set to 1, the cluster reads as all zeros. The host cluster offset can be used to
///   describe a preallocation, but it won’t be used for reading data from this cluster, nor is
///   data read from the backing file if the cluster is unallocated.  With version 2 or with
///   extended L2 entries (see the next section), this is always 0.
/// - Bit 1 – 8: Reserved (set to 0)
/// - Bit 9 – 55: Bits 9-55 of host cluster offset. Must be aligned to a cluster boundary. If the
///   offset is 0 and bit 63 is clear, the cluster is unallocated. The offset may only be 0 with
///   bit 63 set (indicating a host cluster offset of 0) when an external data file is used.
/// - Bit 56 - 61: Reserved (set to 0)
///
/// Compressed Cluster Descriptor (`x = 62 - (cluster_bits - 8)`):
/// - Bit 0 - x-1: Host cluster offset.  This is usually _not_ aligned to a cluster or sector
///   boundary!  If cluster_bits is small enough that this field includes bits beyond 55, those
///   upper bits must be set to 0.
/// - Bit x - 61: Number of additional 512-byte sectors used for the compressed data, beyond the
///   sector containing the offset in the previous field. Some of these sectors may reside in the
///   next contiguous host cluster.  Note that the compressed data does not necessarily occupy all
///   of the bytes in the final sector; rather, decompression stops when it has produced a cluster
///   of data.  Another compressed cluster may map to the tail of the final sector used by this
///   compressed cluster.
#[derive(Copy, Clone, Default, Debug)]
pub(super) struct L2Entry(u64);

/// Internal actual type of L2 entries.
///
/// Using atomic allows flushing L2 tables from the cache while they are write-locked.
#[derive(Default, Debug)]
pub(super) struct AtomicL2Entry(AtomicU64);

/// High-level representation of an L2 entry.
#[derive(Debug, Clone)]
pub(super) enum L2Mapping {
    /// Data is in the data file.
    DataFile {
        /// Cluster in the data file.
        host_cluster: HostCluster,

        /// Whether the cluster has a refcount of exactly 1.
        copied: bool,
    },

    /// Data is in the backing file.
    Backing {
        /// Guest cluster index.
        backing_offset: u64,
    },

    /// Data is zero.
    Zero {
        /// Preallocated cluster in the data file, if any.
        host_cluster: Option<HostCluster>,

        /// Whether the preallocated cluster has a refcount of exactly 1.
        copied: bool,
    },

    /// Data is compressed.
    Compressed {
        /// Offset in the data file.
        host_offset: HostOffset,

        /// Upper limit on the number of bytes that comprise the compressed data.
        length: u64,
    },
}

impl L2Entry {
    /// Offset of the data cluster, if any.
    ///
    /// Assumes the L2 entry references a data cluster, not a compressed cluster.
    ///
    /// `external_data_file` must be true when using an external data file; in this case, offset 0
    /// is a valid offset, and can only be distinguished from “unallocated” by whether the COPIED
    /// flag is set or not (which it always is when using an external data file).
    pub fn cluster_offset(&self, external_data_file: bool) -> Option<HostOffset> {
        let ofs = self.0 & 0x00ff_ffff_ffff_fe00u64;
        if ofs != 0 || (external_data_file && self.is_copied()) {
            Some(HostOffset(ofs))
        } else {
            None
        }
    }

    /// Whether the cluster is compressed.
    pub fn is_compressed(&self) -> bool {
        self.0 & (1u64 << 62) != 0
    }

    /// Whether the cluster is “copied”.
    ///
    /// `true` means its refcount is one, `false` means modifying it will require COW.
1422    pub fn is_copied(&self) -> bool {
1423        self.0 & (1u64 << 63) != 0
1424    }
1425
1426    /// Clear “copied” flag.
1427    #[must_use]
1428    pub fn without_copied(self) -> Self {
1429        L2Entry(self.0 & !(1u64 << 63))
1430    }
1431
1432    /// Whether the cluster is a zero cluster.
1433    ///
1434    /// Assumes the L2 entry references a data cluster, not a compressed cluster.
1435    pub fn is_zero(&self) -> bool {
1436        self.0 & (1u64 << 0) != 0
1437    }
1438
1439    /// Return all reserved bits.
1440    pub fn reserved_bits(&self) -> u64 {
1441        if self.is_compressed() {
1442            self.0 & 0x8000_0000_0000_0000u64
1443        } else {
1444            self.0 & 0x3f00_0000_0000_01feu64
1445        }
1446    }
1447
1448    /// Return the full compressed cluster descriptor.
1449    pub fn compressed_descriptor(&self) -> u64 {
1450        self.0 & 0x3fff_ffff_ffff_ffffu64
1451    }
1452
1453    /// If this entry is compressed, return the start host offset and upper limit on the compressed
1454    /// number of bytes.
1455    pub fn compressed_range(&self, cluster_bits: u32) -> Option<(HostOffset, u64)> {
1456        if self.is_compressed() {
1457            let desc = self.compressed_descriptor();
1458            let compressed_offset_bits = 62 - (cluster_bits - 8);
1459            let offset = desc & ((1 << compressed_offset_bits) - 1) & 0x00ff_ffff_ffff_ffffu64;
1460            let sectors = desc >> compressed_offset_bits;
1461            // The first sector is not considered in `sectors`, so we add it and subtract the
1462            // number of bytes there that do not belong to this compressed cluster
1463            let length = (sectors + 1) * 512 - (offset & 511);
1464
1465            Some((HostOffset(offset), length))
1466        } else {
1467            None
1468        }
1469    }
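    // Worked example (illustrative only, not part of the original logic): with 64 KiB clusters
    // (`cluster_bits` = 16), `compressed_offset_bits` = 62 - (16 - 8) = 54, so the low 54 bits of
    // the descriptor hold the host offset and the remaining 8 bits (bits 54..62) hold the
    // additional sector count.  For a hypothetical descriptor with host offset 0x1_0280 and a
    // sector count of 2:
    //
    //     offset & 511 = 0x80 = 128
    //     length = (2 + 1) * 512 - 128 = 1408
    //
    // i.e. at most 1408 bytes of compressed data, ending at the end of the third 512-byte sector
    // (0x1_0280 + 1408 = 0x1_0800).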
1470
1471    /// If this entry is allocated, return the first host cluster and the number of clusters it
1472    /// references.
1473    ///
1474    /// `external_data_file` must be true when using an external data file.
1475    fn allocation(
1476        &self,
1477        cluster_bits: u32,
1478        external_data_file: bool,
1479    ) -> Option<(HostCluster, ClusterCount)> {
1480        if let Some((offset, length)) = self.compressed_range(cluster_bits) {
1481            // Compressed clusters can cross host cluster boundaries, and thus occupy two clusters
1482            let first_cluster = offset.cluster(cluster_bits);
1483            let cluster_count = ClusterCount::from_byte_size(
1484                offset + length - first_cluster.offset(cluster_bits),
1485                cluster_bits,
1486            );
1487            Some((first_cluster, cluster_count))
1488        } else {
1489            self.cluster_offset(external_data_file)
1490                .map(|ofs| (ofs.cluster(cluster_bits), ClusterCount(1)))
1491        }
1492    }
1493
1494    /// Return the high-level `L2Mapping` representation.
1495    ///
1496    /// `guest_cluster` is the guest cluster being accessed, `cluster_bits` is log2 of the cluster
1497    /// size.  `external_data_file` must be true when using an external data file.
1498    fn into_mapping(
1499        self,
1500        guest_cluster: GuestCluster,
1501        cluster_bits: u32,
1502        external_data_file: bool,
1503    ) -> io::Result<L2Mapping> {
1504        let mapping = if let Some((offset, length)) = self.compressed_range(cluster_bits) {
1505            L2Mapping::Compressed {
1506                host_offset: offset,
1507                length,
1508            }
1509        } else if self.is_zero() {
1510            let host_cluster = self
1511                .cluster_offset(external_data_file)
1512                .map(|ofs| {
1513                    ofs.checked_cluster(cluster_bits).ok_or_else(|| {
1514                        let offset = guest_cluster.offset(cluster_bits);
1515                        io::Error::other(format!(
1516                            "Unaligned pre-allocated zero cluster at {offset}; L2 entry: {self:?}"
1517                        ))
1518                    })
1519                })
1520                .transpose()?;
1521
1522            L2Mapping::Zero {
1523                host_cluster,
1524                copied: host_cluster.is_some() && self.is_copied(),
1525            }
1526        } else if let Some(host_offset) = self.cluster_offset(external_data_file) {
1527            let host_cluster = host_offset.checked_cluster(cluster_bits).ok_or_else(|| {
1528                let offset = guest_cluster.offset(cluster_bits);
1529                io::Error::other(format!(
1530                    "Unaligned data cluster at {offset}; L2 entry: {self:?}"
1531                ))
1532            })?;
1533
1534            L2Mapping::DataFile {
1535                host_cluster,
1536                copied: self.is_copied(),
1537            }
1538        } else {
1539            L2Mapping::Backing {
1540                backing_offset: guest_cluster.offset(cluster_bits).0,
1541            }
1542        };
1543
1544        Ok(mapping)
1545    }
1546
1547    /// Create an L2 entry from its high-level `L2Mapping` representation.
1548    fn from_mapping(value: L2Mapping, cluster_bits: u32) -> Self {
1549        let num_val: u64 = match value {
1550            L2Mapping::DataFile {
1551                host_cluster,
1552                copied,
1553            } => {
1554                debug_assert!(host_cluster.offset(cluster_bits) <= MAX_OFFSET);
1555                if copied {
1556                    (1 << 63) | host_cluster.offset(cluster_bits).0
1557                } else {
1558                    host_cluster.offset(cluster_bits).0
1559                }
1560            }
1561
1562            L2Mapping::Backing { backing_offset: _ } => 0,
1563
1564            L2Mapping::Zero {
1565                host_cluster,
1566                copied,
1567            } => {
1568                let host_offset = host_cluster.map(|hc| hc.offset(cluster_bits));
1569                debug_assert!(host_offset.unwrap_or(HostOffset(0)) <= MAX_OFFSET);
1570                if copied {
1571                    (1 << 63) | host_offset.unwrap().0 | 0x1
1572                } else {
1573                    host_offset.unwrap_or(HostOffset(0)).0 | 0x1
1574                }
1575            }
1576
1577            L2Mapping::Compressed {
1578                host_offset,
1579                length,
1580            } => {
1581                let compressed_offset_bits = 62 - (cluster_bits - 8);
1582                assert!(length < 1 << cluster_bits);
1583                assert!(host_offset.0 < 1 << compressed_offset_bits);
1584
1585                // The first sector is not considered, so we subtract the number of bytes in it
1586                // that belong to this compressed cluster from `length`:
1587                // ceil((length - (512 - (host_offset & 511))) / 512)
1588                // = (length + 511 - 512 + (host_offset & 511)) / 512
1589                let sectors = (length - 1 + (host_offset.0 & 511)) / 512;
1590
1591                (1 << 62) | (sectors << compressed_offset_bits) | host_offset.0
1592            }
1593        };
1594
1595        let entry = L2Entry(num_val);
1596        debug_assert!(entry.reserved_bits() == 0);
1597        entry
1598    }
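    // Consistency check for the sector arithmetic above (illustrative, reusing the hypothetical
    // values from the `compressed_range()` example): for `host_offset` = 0x1_0280 and
    // `length` = 1408, `sectors` = (1408 - 1 + 128) / 512 = 1535 / 512 = 2, which decodes back to
    // an upper limit of (2 + 1) * 512 - 128 = 1408 bytes.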
1599}
1600
1601impl AtomicL2Entry {
1602    /// Get the contained value.
1603    fn get(&self) -> L2Entry {
1604        L2Entry(self.0.load(Ordering::Relaxed))
1605    }
1606
1607    /// Exchange the contained value.
1608    ///
1609    /// # Safety
1610    /// Caller must ensure that:
1611    /// (1) No reader sees invalid intermediate states.
1612    /// (2) Updates are done atomically (do not depend on prior state of the L2 table), or there is
1613    ///     only one writer at a time.
1614    unsafe fn swap(&self, l2e: L2Entry) -> L2Entry {
1615        L2Entry(self.0.swap(l2e.0, Ordering::Relaxed))
1616    }
1617}
1618
1619impl TableEntry for AtomicL2Entry {
1620    fn try_from_plain(value: u64, header: &Header) -> io::Result<Self> {
1621        let entry = L2Entry(value);
1622
1623        if entry.reserved_bits() != 0 {
1624            return Err(invalid_data(format!(
1625                "Invalid L2 entry 0x{value:x}, reserved bits set (0x{:x})",
1626                entry.reserved_bits(),
1627            )));
1628        }
1629
1630        if let Some(offset) = entry.cluster_offset(header.external_data_file()) {
1631            if !entry.is_compressed() && offset.in_cluster_offset(header.cluster_bits()) != 0 {
1632                return Err(invalid_data(format!(
1633                    "Invalid L2 entry 0x{value:x}, offset ({offset}) is not aligned to cluster size (0x{:x})",
1634                    header.cluster_size(),
1635                )));
1636            }
1637        }
1638
1639        Ok(AtomicL2Entry(AtomicU64::new(entry.0)))
1640    }
1641
1642    fn to_plain(&self) -> u64 {
1643        self.get().0
1644    }
1645}
1646
1647impl L2Mapping {
1648    /// Check whether two mappings are consecutive.
1649    ///
1650    /// Given the `preceding` mapping, check whether `self` is consecutive to it, i.e. is the same
1651    /// kind of mapping, and the offsets are consecutive.
1652    pub fn is_consecutive(&self, preceding: &L2Mapping, cluster_bits: u32) -> bool {
1653        match preceding {
1654            L2Mapping::DataFile {
1655                host_cluster: prior_cluster,
1656                copied,
1657            } => {
1658                if let L2Mapping::DataFile {
1659                    host_cluster: next_cluster,
1660                    copied: next_copied,
1661                } = self
1662                {
1663                    *next_cluster == *prior_cluster + ClusterCount(1) && *next_copied == *copied
1664                } else {
1665                    false
1666                }
1667            }
1668
1669            L2Mapping::Backing {
1670                backing_offset: prior_backing_offset,
1671            } => {
1672                let Some(expected_next) = prior_backing_offset.checked_add(1 << cluster_bits)
1673                else {
1674                    return false;
1675                };
1676
1677                if let L2Mapping::Backing {
1678                    backing_offset: next_offset,
1679                } = self
1680                {
1681                    *next_offset == expected_next
1682                } else {
1683                    false
1684                }
1685            }
1686
1687            L2Mapping::Zero {
1688                host_cluster: _,
1689                copied: _,
1690            } => {
1691                // Cluster and copied do not matter; every read is continuous regardless (always
1692                // zero), and every write is, too (always allocate)
1693                matches!(
1694                    self,
1695                    L2Mapping::Zero {
1696                        host_cluster: _,
1697                        copied: _,
1698                    }
1699                )
1700            }
1701
1702            L2Mapping::Compressed {
1703                host_offset: _,
1704                length: _,
1705            } => {
1706                // Not strictly true, but it works in practice: reads go through a special
1707                // function anyway, and every write will need COW anyway.
1708                matches!(
1709                    self,
1710                    L2Mapping::Compressed {
1711                        host_offset: _,
1712                        length: _,
1713                    }
1714                )
1715            }
1716        }
1717    }
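    // Illustrative example (not part of the original source): with `cluster_bits` = 16,
    // `Backing { backing_offset: 0x1_0000 }` followed by `Backing { backing_offset: 0x2_0000 }`
    // is consecutive (the offsets differ by exactly one cluster), while two `DataFile` mappings
    // are only consecutive if their host clusters are adjacent and their `copied` flags match.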
1718}
1719
1720/// L2 table.
1721#[derive(Debug)]
1722pub(super) struct L2Table {
1723    /// Cluster of the L2 table.
1724    cluster: Option<HostCluster>,
1725
1726    /// Table data.
1727    data: Box<[AtomicL2Entry]>,
1728
1729    /// log2 of the cluster size.
1730    cluster_bits: u32,
1731
1732    /// Whether this image uses an external data file.
1733    external_data_file: bool,
1734
1735    /// Whether this table has been modified since it was last written.
1736    modified: AtomicBool,
1737
1738    /// Lock for creating `L2TableWriteGuard`.
1739    writer_lock: Mutex<()>,
1740}
1741
1742/// Write guard for an L2 table.
1743#[derive(Debug)]
1744pub(super) struct L2TableWriteGuard<'a> {
1745    /// Referenced L2 table.
1746    table: &'a L2Table,
1747
1748    /// Held guard on that L2 table’s writer lock.
1749    _lock: MutexGuard<'a, ()>,
1750}
1751
1752impl L2Table {
1753    /// Create a new zeroed L2 table.
1754    pub fn new_cleared(header: &Header) -> Self {
1755        let mut data = Vec::with_capacity(header.l2_entries());
1756        data.resize_with(header.l2_entries(), Default::default);
1757
1758        L2Table {
1759            cluster: None,
1760            data: data.into_boxed_slice(),
1761            cluster_bits: header.cluster_bits(),
1762            external_data_file: header.external_data_file(),
1763            modified: true.into(),
1764            writer_lock: Default::default(),
1765        }
1766    }
1767
1768    /// Look up a cluster mapping.
1769    pub fn get_mapping(&self, lookup_cluster: GuestCluster) -> io::Result<L2Mapping> {
1770        self.get(lookup_cluster.l2_index(self.cluster_bits))
1771            .into_mapping(lookup_cluster, self.cluster_bits, self.external_data_file)
1772    }
1773
1774    /// Allow modifying this L2 table.
1775    ///
1776    /// Note that readers are allowed to exist while modifications are happening.
1777    pub async fn lock_write(&self) -> L2TableWriteGuard<'_> {
1778        L2TableWriteGuard {
1779            table: self,
1780            _lock: self.writer_lock.lock().await,
1781        }
1782    }
1783}
1784
1785impl L2TableWriteGuard<'_> {
1786    /// Look up a cluster mapping.
1787    pub fn get_mapping(&self, lookup_cluster: GuestCluster) -> io::Result<L2Mapping> {
1788        self.table.get_mapping(lookup_cluster)
1789    }
1790
1791    /// Enter the given raw data cluster mapping into the L2 table.
1792    ///
1793    /// If the previous entry pointed to an allocated cluster, return the old allocation so its
1794    /// refcount can be decreased (offset of the first cluster and number of clusters -- compressed
1795    /// clusters can span across host cluster boundaries).
1796    ///
1797    /// If the allocation is reused, `None` is returned, so this function only returns `Some(_)` if
1798    /// some cluster is indeed leaked.
1799    #[must_use = "Leaked allocation must be freed"]
1800    pub fn map_cluster(
1801        &mut self,
1802        index: usize,
1803        host_cluster: HostCluster,
1804    ) -> Option<(HostCluster, ClusterCount)> {
1805        let new = L2Entry::from_mapping(
1806            L2Mapping::DataFile {
1807                host_cluster,
1808                copied: true,
1809            },
1810            self.table.cluster_bits,
1811        );
1812        // Safe: We set a full valid mapping, and there is only one writer (thanks to
1813        // `L2TableWriteGuard`).
1814        let l2e = unsafe { self.table.data[index].swap(new) };
1815        self.table.modified.store(true, Ordering::Relaxed);
1816
1817        let allocation = l2e.allocation(self.table.cluster_bits, self.table.external_data_file);
1818        if let Some((a_cluster, a_count)) = allocation {
1819            if a_cluster == host_cluster && a_count == ClusterCount(1) {
1820                None
1821            } else {
1822                allocation
1823            }
1824        } else {
1825            None
1826        }
1827    }
1828
1829    /// Make the given index a zero mapping.
1830    ///
1831    /// If `keep_allocation` is true, keep the zero cluster pre-allocated if there is a
1832    /// pre-existing single-cluster allocation (i.e. data cluster or pre-allocated zero cluster).
1833    /// Otherwise, the existing mapping is discarded.
1834    ///
1835    /// If a previous mapping is discarded, return the old allocation so its refcount can be
1836    /// decreased (offset of the first cluster and number of clusters -- compressed clusters can
1837    /// span across host cluster boundaries).
1838    #[must_use = "Leaked allocation must be freed"]
1839    pub fn zero_cluster(
1840        &mut self,
1841        index: usize,
1842        keep_allocation: bool,
1843    ) -> io::Result<Option<(HostCluster, ClusterCount)>> {
1844        let cluster_copied = if keep_allocation {
1845            match self.table.data[index].get().into_mapping(
1846                GuestCluster(0), // only used for backing, which we ignore
1847                self.table.cluster_bits,
1848                self.table.external_data_file,
1849            )? {
1850                L2Mapping::DataFile {
1851                    host_cluster,
1852                    copied,
1853                } => Some((host_cluster, copied)),
1854                L2Mapping::Backing { backing_offset: _ } => None,
1855                L2Mapping::Zero {
1856                    host_cluster: Some(host_cluster),
1857                    copied,
1858                } => Some((host_cluster, copied)),
1859                L2Mapping::Zero {
1860                    host_cluster: None,
1861                    copied: _,
1862                } => None,
1863                L2Mapping::Compressed {
1864                    host_offset: _,
1865                    length: _,
1866                } => None,
1867            }
1868        } else {
1869            None
1870        };
1871
1872        let retained = cluster_copied.is_some();
1873        let new = if let Some((cluster, copied)) = cluster_copied {
1874            L2Mapping::Zero {
1875                host_cluster: Some(cluster),
1876                copied,
1877            }
1878        } else {
1879            L2Mapping::Zero {
1880                host_cluster: None,
1881                copied: false,
1882            }
1883        };
1884        let new = L2Entry::from_mapping(new, self.table.cluster_bits);
1885
1886        // Safe: We set a full valid mapping, and there is only one writer (thanks to
1887        // `L2TableWriteGuard`).
1888        let old = unsafe { self.table.data[index].swap(new) };
1889        self.table.modified.store(true, Ordering::Relaxed);
1890
1891        let leaked = if !retained {
1892            old.allocation(self.table.cluster_bits, self.table.external_data_file)
1893        } else {
1894            None
1895        };
1896        Ok(leaked)
1897    }
1898
1899    /// Remove the given mapping, leaving it empty.
1900    ///
1901    /// If a previous mapping is discarded, return the old allocation so its refcount can be
1902    /// decreased (offset of the first cluster and number of clusters -- compressed clusters can
1903    /// span across host cluster boundaries).
1904    #[must_use = "Leaked allocation must be freed"]
1905    pub fn discard_cluster(&mut self, index: usize) -> Option<(HostCluster, ClusterCount)> {
1906        let new = L2Entry(0);
1907
1908        // Safe: We set a full valid mapping, and there is only one writer (thanks to
1909        // `L2TableWriteGuard`).
1910        let old = unsafe { self.table.data[index].swap(new) };
1911        self.table.modified.store(true, Ordering::Relaxed);
1912
1913        old.allocation(self.table.cluster_bits, self.table.external_data_file)
1914    }
1915}
1916
1917impl Table for L2Table {
1918    type InternalEntry = AtomicL2Entry;
1919    type Entry = L2Entry;
1920    const NAME: &'static str = "L2 table";
1921    const MAX_ENTRIES: usize = MAX_CLUSTER_SIZE / 8;
1922
1923    fn from_data(data: Box<[AtomicL2Entry]>, header: &Header) -> Self {
1924        assert!(data.len() == header.l2_entries());
1925
1926        Self {
1927            cluster: None,
1928            data,
1929            cluster_bits: header.cluster_bits(),
1930            external_data_file: header.external_data_file(),
1931            modified: true.into(),
1932            writer_lock: Default::default(),
1933        }
1934    }
1935
1936    fn entries(&self) -> usize {
1937        self.data.len()
1938    }
1939
1940    fn get_ref(&self, index: usize) -> Option<&AtomicL2Entry> {
1941        self.data.get(index)
1942    }
1943
1944    fn get(&self, index: usize) -> L2Entry {
1945        self.data
1946            .get(index)
1947            .map(|l2e| l2e.get())
1948            .unwrap_or(L2Entry(0))
1949    }
1950
1951    fn get_cluster(&self) -> Option<HostCluster> {
1952        self.cluster
1953    }
1954
1955    fn get_offset(&self) -> Option<HostOffset> {
1956        self.cluster.map(|index| index.offset(self.cluster_bits))
1957    }
1958
1959    fn set_cluster(&mut self, cluster: HostCluster) {
1960        self.cluster = Some(cluster);
1961        self.modified.store(true, Ordering::Relaxed);
1962    }
1963
1964    fn unset_cluster(&mut self) {
1965        self.cluster = None;
1966    }
1967
1968    fn is_modified(&self) -> bool {
1969        self.modified.load(Ordering::Relaxed)
1970    }
1971
1972    fn clear_modified(&self) {
1973        self.modified.store(false, Ordering::Relaxed);
1974    }
1975
1976    fn set_modified(&self) {
1977        self.modified.store(true, Ordering::Relaxed);
1978    }
1979
1980    fn cluster_bits(&self) -> u32 {
1981        self.cluster_bits
1982    }
1983}
1984
1985impl Clone for L2Table {
1986    fn clone(&self) -> Self {
1987        let mut data = Vec::with_capacity(self.data.len());
1988        for entry in &self.data {
1989            // None of these can be `copied`: cloning the table means the clusters it references are (or will be) referenced more than once
1990            let entry = entry.get().without_copied();
1991            data.push(AtomicL2Entry(AtomicU64::new(entry.0)));
1992        }
1993
1994        let modified = AtomicBool::new(self.is_modified());
1995
1996        L2Table {
1997            cluster: None,
1998            data: data.into_boxed_slice(),
1999            cluster_bits: self.cluster_bits,
2000            external_data_file: self.external_data_file,
2001            modified,
2002            writer_lock: Default::default(),
2003        }
2004    }
2005}
2006
2007impl Drop for L2Table {
2008    fn drop(&mut self) {
2009        if self.is_modified() {
2010            error!("L2 table dropped while modified; was the image closed before being flushed?");
2011        }
2012    }
2013}
2014
2015/// Refcount table entry.
2016#[derive(Copy, Clone, Default, Debug)]
2017pub(super) struct RefTableEntry(u64);
2018
2019impl RefTableEntry {
2020    /// Offset of the referenced refblock, if any.
2021    pub fn refblock_offset(&self) -> Option<HostOffset> {
2022        let ofs = self.0 & 0xffff_ffff_ffff_fe00u64;
2023        if ofs == 0 {
2024            None
2025        } else {
2026            Some(HostOffset(ofs))
2027        }
2028    }
2029
2030    /// Return all reserved bits.
2031    pub fn reserved_bits(&self) -> u64 {
2032        self.0 & 0x0000_0000_0000_01ffu64
2033    }
2034}
2035
2036impl TableEntry for RefTableEntry {
2037    fn try_from_plain(value: u64, header: &Header) -> io::Result<Self> {
2038        let entry = RefTableEntry(value);
2039
2040        if entry.reserved_bits() != 0 {
2041            return Err(invalid_data(format!(
2042                "Invalid reftable entry 0x{value:x}, reserved bits set (0x{:x})",
2043                entry.reserved_bits(),
2044            )));
2045        }
2046
2047        if let Some(rb_ofs) = entry.refblock_offset() {
2048            if rb_ofs.in_cluster_offset(header.cluster_bits()) != 0 {
2049                return Err(invalid_data(
2050                    format!(
2051                        "Invalid reftable entry 0x{value:x}, offset ({rb_ofs}) is not aligned to cluster size (0x{:x})",
2052                        header.cluster_size(),
2053                    ),
2054                ));
2055            }
2056        }
2057
2058        Ok(entry)
2059    }
2060
2061    fn to_plain(&self) -> u64 {
2062        self.0
2063    }
2064}
2065
2066/// Refcount table.
2067#[derive(Debug)]
2068pub(super) struct RefTable {
2069    /// First cluster of this refcount table in the image file.
2070    cluster: Option<HostCluster>,
2071
2072    /// Table data.
2073    data: Box<[RefTableEntry]>,
2074
2075    /// log2 of the cluster size.
2076    cluster_bits: u32,
2077
2078    /// Whether this table has been modified since it was last written.
2079    modified: AtomicBool,
2080}
2081
2082impl RefTable {
2083    /// Create a clone that covers at least `at_least_index`.
2084    ///
2085    /// Also ensure that beyond `at_least_index`, there are enough entries to self-describe the new
2086    /// refcount table (so that it can actually be allocated).
2087    pub fn clone_and_grow(&self, header: &Header, at_least_index: usize) -> io::Result<Self> {
2088        let cluster_size = header.cluster_size();
2089        let rb_entries = header.rb_entries();
2090
2091        // There surely is an optimal O(1) solution, but it would probably be less clear, and
2092        // this is not a hot path.
2093        let mut extra_rbs = 1;
2094        let new_entry_count = loop {
2095            let entry_count = cmp::max(at_least_index + 1 + extra_rbs, self.data.len());
2096            let entry_count = entry_count.next_multiple_of(cluster_size / size_of::<u64>());
2097            let size = entry_count * size_of::<u64>();
2098            // Full number of clusters needed for both the new reftable *and* the `extra_rbs`
2099            let refcount_clusters = size / cluster_size + extra_rbs;
2100            let rbs_needed = refcount_clusters.div_ceil(rb_entries);
2101            if extra_rbs == rbs_needed {
2102                break entry_count;
2103            }
2104            extra_rbs = rbs_needed;
2105        };
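        // Worked example (illustrative only): with 64 KiB clusters, 16-bit refcounts
        // (`rb_entries` = 32768) and `at_least_index` = 100000, the first iteration yields
        // `entry_count` = 100002 rounded up to a multiple of 8192 = 106496 entries, i.e. 13
        // reftable clusters plus 1 extra refblock = 14 clusters to cover, which a single
        // refblock can describe -- so `rbs_needed` == `extra_rbs` == 1 and the loop ends with
        // 106496 entries.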
2106
2107        if new_entry_count > <Self as Table>::MAX_ENTRIES {
2108            return Err(io::Error::other(
2109                "Cannot grow the image to this size; refcount table would become too big",
2110            ));
2111        }
2112
2113        let mut new_data = vec![RefTableEntry::default(); new_entry_count];
2114        new_data[..self.data.len()].copy_from_slice(&self.data);
2115
2116        Ok(Self {
2117            cluster: None,
2118            data: new_data.into_boxed_slice(),
2119            cluster_bits: header.cluster_bits(),
2120            modified: true.into(),
2121        })
2122    }
2123
2124    /// Check whether `index` is in bounds.
2125    pub fn in_bounds(&self, index: usize) -> bool {
2126        index < self.data.len()
2127    }
2128
2129    /// Enter the given refcount block into this refcount table.
2130    pub fn enter_refblock(&mut self, index: usize, rb: &RefBlock) -> io::Result<()> {
2131        let rb_offset = rb.get_offset().ok_or_else(|| {
2132            io::Error::new(
2133                io::ErrorKind::InvalidInput,
2134                "Refcount block has no assigned offset",
2135            )
2136        })?;
2137
2138        let rt_entry = RefTableEntry(rb_offset.0);
2139        debug_assert!(rt_entry.reserved_bits() == 0);
2140        self.data[index] = rt_entry;
2141        self.modified.store(true, Ordering::Relaxed);
2142
2143        Ok(())
2144    }
2145}
2146
2147impl Table for RefTable {
2148    type InternalEntry = RefTableEntry;
2149    type Entry = RefTableEntry;
2150    const NAME: &'static str = "Refcount table";
2151
2152    /// Maximum number of refcount table entries.
2153    ///
2154    /// QEMU has no such limit, but it makes sense to use the same one as for the L1 table.  Note that refcount
2155    /// blocks usually cover more clusters than an L2 table, so this generally allows larger image
2156    /// files than would be necessary for the maximum guest disk size determined by the maximum
2157    /// number of L1 entries.
2158    const MAX_ENTRIES: usize = <L1Table as Table>::MAX_ENTRIES;
2159
2160    fn from_data(data: Box<[RefTableEntry]>, header: &Header) -> Self {
2161        Self {
2162            cluster: None,
2163            data,
2164            cluster_bits: header.cluster_bits(),
2165            modified: true.into(),
2166        }
2167    }
2168
2169    fn entries(&self) -> usize {
2170        self.data.len()
2171    }
2172
2173    fn get_ref(&self, index: usize) -> Option<&RefTableEntry> {
2174        self.data.get(index)
2175    }
2176
2177    fn get(&self, index: usize) -> RefTableEntry {
2178        self.data.get(index).copied().unwrap_or(RefTableEntry(0))
2179    }
2180
2181    fn get_cluster(&self) -> Option<HostCluster> {
2182        self.cluster
2183    }
2184
2185    fn get_offset(&self) -> Option<HostOffset> {
2186        self.cluster.map(|index| index.offset(self.cluster_bits))
2187    }
2188
2189    fn set_cluster(&mut self, cluster: HostCluster) {
2190        self.cluster = Some(cluster);
2191        self.modified.store(true, Ordering::Relaxed);
2192    }
2193
2194    fn unset_cluster(&mut self) {
2195        self.cluster = None;
2196    }
2197
2198    fn is_modified(&self) -> bool {
2199        self.modified.load(Ordering::Relaxed)
2200    }
2201
2202    fn clear_modified(&self) {
2203        self.modified.store(false, Ordering::Relaxed);
2204    }
2205
2206    fn set_modified(&self) {
2207        self.modified.store(true, Ordering::Relaxed);
2208    }
2209
2210    fn cluster_bits(&self) -> u32 {
2211        self.cluster_bits
2212    }
2213}
2214
2215/// Refcount block.
2216pub(super) struct RefBlock {
2217    /// Cluster in the image file.
2218    cluster: Option<HostCluster>,
2219
2220    /// Raw table data (big endian).
2221    raw_data: IoBuffer,
2222
2223    /// log2 of the number of bits per refcount entry.
2224    refcount_order: u32,
2225
2226    /// log2 of the cluster size.
2227    cluster_bits: u32,
2228
2229    /// Whether this block has been modified since it was last written.
2230    modified: AtomicBool,
2231
2232    /// Lock for creating `RefBlockWriteGuard`.
2233    writer_lock: Mutex<()>,
2234}
2235
2236/// Write guard for a refblock.
2237pub(super) struct RefBlockWriteGuard<'a> {
2238    /// Referenced refblock.
2239    rb: &'a RefBlock,
2240
2241    /// Held guard on that refblock’s writer lock.
2242    _lock: MutexGuard<'a, ()>,
2243}
2244
2245impl RefBlock {
2246    /// Create a new zeroed refcount block.
2247    pub fn new_cleared<S: Storage>(for_image: &S, header: &Header) -> io::Result<Self> {
2248        let mut raw_data = IoBuffer::new(header.cluster_size(), for_image.mem_align())?;
2249        raw_data.as_mut().into_slice().fill(0);
2250
2251        Ok(RefBlock {
2252            cluster: None,
2253            raw_data,
2254            refcount_order: header.refcount_order(),
2255            cluster_bits: header.cluster_bits(),
2256            modified: true.into(),
2257            writer_lock: Default::default(),
2258        })
2259    }
2260
2261    /// Load a refcount block from disk.
2262    pub async fn load<S: Storage>(
2263        image: &S,
2264        header: &Header,
2265        cluster: HostCluster,
2266    ) -> io::Result<Self> {
2267        let cluster_bits = header.cluster_bits();
2268        let cluster_size = 1 << cluster_bits;
2269        let refcount_order = header.refcount_order();
2270        let offset = cluster.offset(cluster_bits);
2271
2272        check_table(
2273            "Refcount block",
2274            offset.0,
2275            cluster_size,
2276            1,
2277            MAX_CLUSTER_SIZE,
2278            cluster_size,
2279        )?;
2280
2281        let mut raw_data =
2282            IoBuffer::new(cluster_size, cmp::max(image.mem_align(), size_of::<u64>()))?;
2283        image.read(&mut raw_data, offset.0).await?;
2284
2285        Ok(RefBlock {
2286            cluster: Some(cluster),
2287            raw_data,
2288            refcount_order,
2289            cluster_bits,
2290            modified: false.into(),
2291            writer_lock: Default::default(),
2292        })
2293    }
2294
2295    /// Write a refcount block to disk.
2296    pub async fn write<S: Storage>(&self, image: &S) -> io::Result<()> {
2297        let offset = self
2298            .get_offset()
2299            .ok_or_else(|| io::Error::other("Cannot write qcow2 refcount block, no offset set"))?;
2300
2301        self.clear_modified();
2302        if let Err(err) = image.write(self.raw_data.as_ref(), offset.0).await {
2303            self.set_modified();
2304            return Err(err);
2305        }
2306
2307        Ok(())
2308    }
2309
2310    /// Get the block’s cluster in the image file.
2311    pub fn get_cluster(&self) -> Option<HostCluster> {
2312        self.cluster
2313    }
2314
2315    /// Get the block’s offset in the image file.
2316    pub fn get_offset(&self) -> Option<HostOffset> {
2317        self.cluster.map(|index| index.offset(self.cluster_bits))
2318    }
2319
2320    /// Change the block’s cluster in the image file (for writing).
2321    pub fn set_cluster(&mut self, cluster: HostCluster) {
2322        self.cluster = Some(cluster);
2323        self.set_modified();
2324    }
2325
2326    /// Calculate sub-byte refcount access parameters.
2327    ///
2328    /// For a given refcount index, return its:
2329    /// - byte index,
2330    /// - access mask,
2331    /// - in-byte shift.
2332    fn sub_byte_refcount_access(&self, index: usize) -> (usize, u8, usize) {
2333        let order = self.refcount_order;
2334        debug_assert!(order < 3);
2335
2336        // `1 << order` is the number of bits per refcount.  `index` is in units of refcounts,
2337        // so `index << order` is the bit index; dividing that by 8 gives the byte index, which
2338        // (because `order < 3`) is equal to `index >> (3 - order)`.
2339        let byte_index = index >> (3 - order);
2340        // With `bprc = 1 << order` bits per refcount, `(1 << bprc) - 1` is the mask for a
2341        // single refcount (its maximum value).
2342        let mask = (1 << (1 << order)) - 1;
2343        // `index << order` is the bit index (see above); taking it modulo 8 gives the bit
2344        // position (shift) of this refcount inside its byte.
2345        let shift = (index << order) % 8;
2346
2347        (byte_index, mask, shift)
2348    }
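    // Worked example (illustrative only): with 2-bit refcounts (`refcount_order` = 1) and
    // refcount index 13:
    //
    //     byte_index = 13 >> (3 - 1) = 3
    //     mask       = (1 << (1 << 1)) - 1 = 0b11
    //     shift      = (13 << 1) % 8 = 26 % 8 = 2
    //
    // i.e. this refcount occupies bits 2..4 of byte 3 in the refblock.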
2349
2350    /// Get the given cluster’s refcount.
2351    pub fn get(&self, index: usize) -> u64 {
2352        match self.refcount_order {
2353            // refcount_bits == 1, 2, 4
2354            0..=2 => {
2355                let (index, mask, shift) = self.sub_byte_refcount_access(index);
2356                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u8>() };
2357                let atomic =
2358                    unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) };
2359                ((atomic.load(Ordering::Relaxed) >> shift) & mask) as u64
2360            }
2361
2362            // refcount_bits == 8
2363            3 => {
2364                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u8>() };
2365                let atomic =
2366                    unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) };
2367                atomic.load(Ordering::Relaxed) as u64
2368            }
2369
2370            // refcount_bits == 16
2371            4 => {
2372                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u16>() };
2373                let atomic = unsafe {
2374                    AtomicU16::from_ptr(&raw_data_slice[index] as *const u16 as *mut u16)
2375                };
2376                u16::from_be(atomic.load(Ordering::Relaxed)) as u64
2377            }
2378
2379            // refcount_bits == 32
2380            5 => {
2381                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u32>() };
2382                let atomic = unsafe {
2383                    AtomicU32::from_ptr(&raw_data_slice[index] as *const u32 as *mut u32)
2384                };
2385                u32::from_be(atomic.load(Ordering::Relaxed)) as u64
2386            }
2387
2388            // refcount_bits == 64
2389            6 => {
2390                let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::<u64>() };
2391                let atomic = unsafe {
2392                    AtomicU64::from_ptr(&raw_data_slice[index] as *const u64 as *mut u64)
2393                };
2394                u64::from_be(atomic.load(Ordering::Relaxed))
2395            }
2396
2397            _ => unreachable!(),
2398        }
2399    }
2400
2401    /// Allow modifying this refcount block.
2402    ///
2403    /// Note that readers are allowed to exist while modifications are happening.
2404    pub async fn lock_write(&self) -> RefBlockWriteGuard<'_> {
2405        RefBlockWriteGuard {
2406            rb: self,
2407            _lock: self.writer_lock.lock().await,
2408        }
2409    }
2410
2411    /// Check whether this block has been modified since it was last written.
2412    pub fn is_modified(&self) -> bool {
2413        self.modified.load(Ordering::Relaxed)
2414    }
2415
2416    /// Clear the modified flag.
2417    pub fn clear_modified(&self) {
2418        self.modified.store(false, Ordering::Relaxed);
2419    }
2420
2421    /// Set the modified flag.
2422    pub fn set_modified(&self) {
2423        self.modified.store(true, Ordering::Relaxed);
2424    }
2425
2426    /// Check whether the given cluster’s refcount is 0.
2427    pub fn is_zero(&self, index: usize) -> bool {
2428        self.get(index) == 0
2429    }
2430}
2431
2432impl RefBlockWriteGuard<'_> {
2433    /// # Safety
2434    /// Caller must ensure there are no concurrent writers.
2435    unsafe fn fetch_update_bitset(
2436        bitset: &AtomicU8,
2437        change: i64,
2438        base_mask: u8,
2439        shift: usize,
2440    ) -> io::Result<u64> {
2441        let mask = base_mask << shift;
2442
2443        // load + store is OK without concurrent writers
2444        let full = bitset.load(Ordering::Relaxed);
2445        let old = (full & mask) >> shift;
2446        let new = if change > 0 {
2447            let change = change.try_into().map_err(|_| {
2448                io::Error::new(
2449                    io::ErrorKind::InvalidInput,
2450                    format!("Requested refcount change of {change} is too big for the image’s refcount width"),
2451                )
2452            })?;
2453            old.checked_add(change)
2454        } else {
2455            let change = (-change).try_into().map_err(|_| {
2456                io::Error::new(
2457                    io::ErrorKind::InvalidInput,
2458                    format!("Requested refcount change of {change} is too big for the image’s refcount width"),
2459                )
2460            })?;
2461            old.checked_sub(change)
2462        };
2463        let new = new.ok_or_else(|| {
2464            invalid_data(format!(
2465                "Changing refcount from {old} by {change} would overflow"
2466            ))
2467        })?;
2468        if new > base_mask {
2469            return Err(invalid_data(format!(
2470                "Changing refcount from {old} to {new} (by {change}) would overflow"
2471            )));
2472        }
2473
2474        let full = (full & !mask) | (new << shift);
2475        bitset.store(full, Ordering::Relaxed);
2476        Ok(old as u64)
2477    }
2478
2479    /// # Safety
2480    /// Caller must ensure there are no concurrent writers.
2481    unsafe fn fetch_update_full<
2482        T,
2483        L: FnOnce(&T) -> u64,
2484        S: FnOnce(&T, u64) -> Result<(), TryFromIntError>,
2485    >(
2486        atomic: &T,
2487        change: i64,
2488        load: L,
2489        store: S,
2490    ) -> io::Result<u64> {
2491        // load + store is OK without concurrent writers
2492        let old = load(atomic);
2493
2494        let new = if change > 0 {
2495            old.checked_add(change as u64)
2496        } else {
2497            old.checked_sub(-change as u64)
2498        };
2499        let new = new.ok_or_else(|| {
2500            invalid_data(format!(
2501                "Changing refcount from {old} by {change} would overflow"
2502            ))
2503        })?;
2504
2505        store(atomic, new).map_err(|_| {
2506            invalid_data(format!(
2507                "Changing refcount from {old} to {new} (by {change}) would overflow"
2508            ))
2509        })?;
2510
2511        Ok(old)
2512    }
2513
2514    /// Modify the given cluster’s refcount.
2515    fn modify(&mut self, index: usize, change: i64) -> io::Result<u64> {
2516        let result = match self.rb.refcount_order {
2517            // refcount_bits == 1, 2, 4
2518            0..=2 => {
2519                let (index, mask, shift) = self.rb.sub_byte_refcount_access(index);
2520                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u8>() };
2521                let atomic =
2522                    unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) };
2523                // Safe: `RefBlockWriteGuard` ensures there are no concurrent writers.
2524                unsafe { Self::fetch_update_bitset(atomic, change, mask, shift) }
2525            }
2526
2527            // refcount_bits == 8
2528            3 => {
2529                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u8>() };
2530                let atomic =
2531                    unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) };
2532                // Safe: `RefBlockWriteGuard` ensures there are no concurrent writers.
2533                unsafe {
2534                    Self::fetch_update_full(
2535                        atomic,
2536                        change,
2537                        |a| a.load(Ordering::Relaxed) as u64,
2538                        |a, v| {
2539                            a.store(v.try_into()?, Ordering::Relaxed);
2540                            Ok(())
2541                        },
2542                    )
2543                }
2544            }
2545
2546            // refcount_bits == 16
2547            4 => {
2548                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u16>() };
2549                let atomic = unsafe {
2550                    AtomicU16::from_ptr(&raw_data_slice[index] as *const u16 as *mut u16)
2551                };
2552                unsafe {
2553                    Self::fetch_update_full(
2554                        atomic,
2555                        change,
2556                        |a| u16::from_be(a.load(Ordering::Relaxed)) as u64,
2557                        |a, v| {
2558                            a.store(u16::try_from(v)?.to_be(), Ordering::Relaxed);
2559                            Ok(())
2560                        },
2561                    )
2562                }
2563            }
2564
2565            // refcount_bits == 32
2566            5 => {
2567                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u32>() };
2568                let atomic = unsafe {
2569                    AtomicU32::from_ptr(&raw_data_slice[index] as *const u32 as *mut u32)
2570                };
2571                unsafe {
2572                    Self::fetch_update_full(
2573                        atomic,
2574                        change,
2575                        |a| u32::from_be(a.load(Ordering::Relaxed)) as u64,
2576                        |a, v| {
2577                            a.store(u32::try_from(v)?.to_be(), Ordering::Relaxed);
2578                            Ok(())
2579                        },
2580                    )
2581                }
2582            }
2583
2584            // refcount_bits == 64
2585            6 => {
2586                let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::<u64>() };
2587                let atomic = unsafe {
2588                    AtomicU64::from_ptr(&raw_data_slice[index] as *const u64 as *mut u64)
2589                };
2590                unsafe {
2591                    Self::fetch_update_full(
2592                        atomic,
2593                        change,
2594                        |a| u64::from_be(a.load(Ordering::Relaxed)),
2595                        |a, v| {
2596                            a.store(v.to_be(), Ordering::Relaxed);
2597                            Ok(())
2598                        },
2599                    )
2600                }
2601            }
2602
2603            _ => unreachable!(),
2604        };
2605
2606        let result = result?;
2607        self.rb.modified.store(true, Ordering::Relaxed);
2608        Ok(result)
2609    }
2610
2611    /// Increment the given cluster’s refcount.
2612    ///
2613    /// Returns the old value.
2614    pub fn increment(&mut self, index: usize) -> io::Result<u64> {
2615        self.modify(index, 1)
2616    }
2617
2618    /// Decrement the given cluster’s refcount.
2619    ///
2620    /// Returns the old value.
2621    pub fn decrement(&mut self, index: usize) -> io::Result<u64> {
2622        self.modify(index, -1)
2623    }
2624
2625    /// Check whether the given cluster’s refcount is 0.
2626    pub fn is_zero(&self, index: usize) -> bool {
2627        self.rb.is_zero(index)
2628    }
2629}
2630
2631impl Drop for RefBlock {
2632    fn drop(&mut self) {
2633        if self.is_modified() {
2634            error!(
2635                "Refcount block dropped while modified; was the image closed before being flushed?"
2636            );
2637        }
2638    }
2639}
2640
2641/// Generic trait for qcow2 table entries (L1, L2, refcount table).
2642pub trait TableEntry
2643where
2644    Self: Sized,
2645{
2646    /// Load the given raw value, checking it for validity.
2647    fn try_from_plain(value: u64, header: &Header) -> io::Result<Self>;
2648
2649    /// Return the contained raw value.
2650    fn to_plain(&self) -> u64;
2651}
2652
2653/// Generic trait for qcow2 metadata tables (L1, L2, refcount table).
2654pub trait Table: Sized {
2655    /// Internal type for each table entry.
2656    type InternalEntry: TableEntry;
2657    /// Externally visible type for each table entry.
2658    type Entry: Copy;
2659    /// User-readable struct name.
2660    const NAME: &'static str;
2661    /// Maximum allowable number of entries.
2662    const MAX_ENTRIES: usize;
2663
2664    /// Create a new table with the given contents
2665    fn from_data(data: Box<[Self::InternalEntry]>, header: &Header) -> Self;
2666
2667    /// Number of entries.
2668    fn entries(&self) -> usize;
2669    /// Get the given entry (as reference).
2670    fn get_ref(&self, index: usize) -> Option<&Self::InternalEntry>;
2671    /// Get the given entry (copied).
2672    fn get(&self, index: usize) -> Self::Entry;
2673    /// Get this table’s (first) cluster in the image file.
2674    fn get_cluster(&self) -> Option<HostCluster>;
2675    /// Get this table’s offset in the image file.
2676    fn get_offset(&self) -> Option<HostOffset>;
2677    /// Set this table’s (first) cluster in the image file (for writing).
2678    fn set_cluster(&mut self, cluster: HostCluster);
2679    /// Remove the table’s association with any cluster in the image file.
2680    fn unset_cluster(&mut self);
2681
2682    /// Return log2 of the cluster size.
2683    ///
2684    /// All tables store this anyway.
2685    fn cluster_bits(&self) -> u32;
2686
2687    /// Check whether this table has been modified since it was last written.
2688    fn is_modified(&self) -> bool;
2689    /// Clear the modified flag.
2690    fn clear_modified(&self);
2691    /// Set the modified flag.
2692    fn set_modified(&self);
2693
2694    /// Table size in bytes.
2695    fn byte_size(&self) -> usize {
2696        self.entries() * size_of::<u64>()
2697    }
2698
2699    /// Number of clusters used by this table.
2700    fn cluster_count(&self) -> ClusterCount {
2701        ClusterCount::from_byte_size(self.byte_size() as u64, self.cluster_bits())
2702    }
2703
2704    /// Load a table from the image file.
2705    async fn load<S: Storage>(
2706        image: &S,
2707        header: &Header,
2708        cluster: HostCluster,
2709        entries: usize,
2710    ) -> io::Result<Self> {
2711        let offset = cluster.offset(header.cluster_bits());
2712
2713        check_table(
2714            Self::NAME,
2715            offset.0,
2716            entries,
2717            size_of::<u64>(),
2718            Self::MAX_ENTRIES,
2719            header.cluster_size(),
2720        )?;
2721
2722        let byte_size = entries * size_of::<u64>();
2723        let mut buffer = IoBuffer::new(byte_size, cmp::max(image.mem_align(), size_of::<u64>()))?;
2724
2725        image.read(&mut buffer, offset.0).await?;
2726
2727        // Safe because `u64` is a plain type, and the alignment fits
2728        let raw_table = unsafe { buffer.as_ref().into_typed_slice::<u64>() };
2729
2730        let mut table = Vec::<Self::InternalEntry>::with_capacity(entries);
2731        for be_value in raw_table {
2732            table.push(Self::InternalEntry::try_from_plain(
2733                u64::from_be(*be_value),
2734                header,
2735            )?)
2736        }
2737
2738        let mut table = Self::from_data(table.into_boxed_slice(), header);
2739        table.set_cluster(cluster);
2740        table.clear_modified();
2741        Ok(table)
2742    }
2743
2744    /// Write a table to the image file.
2745    ///
2746    /// Callers must ensure the table is copied, i.e. its refcount is 1.
2747    async fn write<S: Storage>(&self, image: &S) -> io::Result<()> {
2748        let offset = self
2749            .get_offset()
2750            .ok_or_else(|| io::Error::other("Cannot write qcow2 metadata table, no offset set"))?;
2751
2752        check_table(
2753            Self::NAME,
2754            offset.0,
2755            self.entries(),
2756            size_of::<u64>(),
2757            Self::MAX_ENTRIES,
2758            1 << self.cluster_bits(),
2759        )?;
2760
2761        let byte_size = self.byte_size();
2762        let mut buffer = IoBuffer::new(byte_size, cmp::max(image.mem_align(), size_of::<u64>()))?;
2763
2764        self.clear_modified();
2765
2766        // Safe because we have just allocated this, and it fits the alignment
2767        let raw_table = unsafe { buffer.as_mut().into_typed_slice::<u64>() };
2768        for (i, be_value) in raw_table.iter_mut().enumerate() {
2769            // 0 always works, that’s by design.
2770            *be_value = self.get_ref(i).map(|e| e.to_plain()).unwrap_or(0).to_be();
2771        }
2772
2773        if let Err(err) = image.write(&buffer, offset.0).await {
2774            self.set_modified();
2775            return Err(err);
2776        }
2777
2778        Ok(())
2779    }
2780
2781    /// Write at least the given single (modified) entry to the image file.
2782    ///
2783    /// Potentially writes more of the table, if alignment requirements ask for that.
2784    async fn write_entry<S: Storage>(&self, image: &S, index: usize) -> io::Result<()> {
2785        // This alignment calculation code implicitly assumes that the cluster size is aligned to
2786        // the storage’s request/memory alignment, but that is often fair.  If that is not the
2787        // case, there is not much we can do anyway.
2788        let byte_size = self.byte_size();
2789        let power_of_two_up_to_byte_size = ((byte_size / 2) + 1).next_power_of_two();
2790        let alignment = cmp::min(
2791            power_of_two_up_to_byte_size,
2792            cmp::max(
2793                cmp::max(image.mem_align(), image.req_align()),
2794                size_of::<u64>(),
2795            ),
2796        );
2797        let alignment_in_entries = alignment / size_of::<u64>();
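        // Worked example (illustrative only): for a 96 KiB table on storage with 4 KiB
        // memory/request alignment, `power_of_two_up_to_byte_size` is 65536, so `alignment` =
        // min(65536, max(4096, 8)) = 4096 bytes = 512 entries.  Writing entry 1000 then
        // rewrites entries 512..1024 as a single aligned 4 KiB request.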
2798
2799        let offset = self
2800            .get_offset()
2801            .ok_or_else(|| io::Error::other("Cannot write qcow2 metadata table, no offset set"))?;
2802
2803        check_table(
2804            Self::NAME,
2805            offset.0,
2806            self.entries(),
2807            size_of::<u64>(),
2808            Self::MAX_ENTRIES,
2809            1 << self.cluster_bits(),
2810        )?;
2811
2812        let mut buffer = IoBuffer::new(alignment, cmp::max(image.mem_align(), size_of::<u64>()))?;
2813
2814        // Safe because we have just allocated this, and it fits the alignment
2815        let raw_entries = unsafe { buffer.as_mut().into_typed_slice::<u64>() };
2816        let first_index = (index / alignment_in_entries) * alignment_in_entries;
2817        #[allow(clippy::needless_range_loop)]
2818        for i in 0..alignment_in_entries {
2819            // 0 always works, that’s by design.
2820            raw_entries[i] = self
2821                .get_ref(first_index + i)
2822                .map(|e| e.to_plain())
2823                .unwrap_or(0)
2824                .to_be();
2825        }
2826
2827        image
2828            .write(&buffer, offset.0 + (first_index * size_of::<u64>()) as u64)
2829            .await
2830    }
2831}
2832
2833/// Check whether the given table offset/size is valid.
2834///
2835/// Also works for refcount blocks (with cheating, because their entry size can be less than a
2836/// byte), which is why it is outside of [`Table`].
2837fn check_table(
2838    name: &str,
2839    offset: u64,
2840    entries: usize,
2841    entry_size: usize,
2842    max_entries: usize,
2843    cluster_size: usize,
2844) -> io::Result<()> {
2845    if entries > max_entries {
2846        return Err(invalid_data(format!(
2847            "{name} too big: {entries} > {max_entries}",
2848        )));
2849    }
2850
2851    if !offset.is_multiple_of(cluster_size as u64) {
2852        return Err(invalid_data(format!("{name}: Unaligned offset: {offset}")));
2853    }
2854
2855    let byte_size = entries
2856        .checked_mul(entry_size)
2857        .ok_or_else(|| invalid_data(format!("{name} size overflow: {entries} * {entry_size}")))?;
2858    let end_offset = offset
2859        .checked_add(byte_size as u64)
2860        .ok_or_else(|| invalid_data(format!("{name} offset overflow: {offset} + {byte_size}")))?;
2861    if end_offset > MAX_FILE_LENGTH {
2862        return Err(invalid_data(format!(
2863            "{name}: Invalid end offset: {end_offset} > {MAX_FILE_LENGTH}"
2864        )));
2865    }
2866
2867    Ok(())
2868}
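
// A minimal, self-contained sanity-check sketch for `check_table()`, added for illustration (it
// is not part of the original test suite); it only exercises the bounds and alignment checks
// defined above, with hypothetical 64 KiB-cluster parameters.
#[cfg(test)]
mod check_table_example {
    use super::*;

    #[test]
    fn check_table_bounds_and_alignment() {
        // 8192 8-byte entries at a cluster-aligned offset fit an L2 table with 64 KiB clusters.
        assert!(check_table("L2 table", 0x1_0000, 8192, 8, 8192, 65536).is_ok());
        // An offset that is not aligned to the cluster size must be rejected.
        assert!(check_table("L2 table", 0x1_0200, 8192, 8, 8192, 65536).is_err());
        // More entries than `max_entries` must be rejected.
        assert!(check_table("L2 table", 0x1_0000, 8193, 8, 8192, 65536).is_err());
    }
}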