imago/qcow2/
allocation.rs

1//! Cluster allocation.
2//!
3//! Functionality for allocating single clusters and ranges of clusters, and general handling of
4//! refcount structures.
5
6use super::*;
7use std::mem;
8use tokio::sync::MutexGuard;
9use tracing::{event, warn, Level};
10
11/// Central facility for cluster allocation.
12pub(super) struct Allocator<S: Storage> {
13    /// Qcow2 metadata file.
14    file: Arc<S>,
15
16    /// Qcow2 refcount table.
17    reftable: RefTable,
18
19    /// The first free cluster index in the qcow2 file, to speed up allocation.
20    first_free_cluster: HostCluster,
21
22    /// Qcow2 image header.
23    header: Arc<Header>,
24
25    /// L2 and refblock caches with dependency management.
26    caches: Arc<MetadataCaches<S>>,
27}
28
29impl<S: Storage + 'static, F: WrappedFormat<S> + 'static> Qcow2<S, F> {
30    /// Return the central allocator instance.
31    ///
32    /// Returns an error for read-only images.
33    async fn allocator(&self) -> io::Result<MutexGuard<'_, Allocator<S>>> {
34        Ok(self
35            .allocator
36            .as_ref()
37            .ok_or_else(|| io::Error::other("Image is read-only"))?
38            .lock()
39            .await)
40    }
41
42    /// Allocate one metadata cluster.
43    ///
44    /// Metadata clusters are allocated exclusively in the metadata (image) file.
45    pub(super) async fn allocate_meta_cluster(&self) -> io::Result<HostCluster> {
46        self.allocate_meta_clusters(ClusterCount(1)).await
47    }
48
49    /// Allocate multiple continuous metadata clusters.
50    ///
51    /// Useful e.g. for the L1 table or refcount table.
52    pub(super) async fn allocate_meta_clusters(
53        &self,
54        count: ClusterCount,
55    ) -> io::Result<HostCluster> {
56        self.allocator().await?.allocate_clusters(count, None).await
57    }
58
59    /// Allocate one data clusters for the given guest cluster.
60    ///
61    /// Without an external data file, data clusters are allocated in the image file, just like
62    /// metadata clusters.
63    ///
64    /// With an external data file, data clusters aren’t really allocated, but just put there at
65    /// the same offset as their guest offset.  Their refcount is not tracked by the qcow2 metadata
66    /// structures (which only cover the metadata (image) file).
67    pub(super) async fn allocate_data_cluster(
68        &self,
69        guest_cluster: GuestCluster,
70    ) -> io::Result<HostCluster> {
71        if self.header.external_data_file() {
72            Ok(HostCluster(guest_cluster.0))
73        } else {
74            let mut allocator = self.allocator().await?;
75
76            // Allocate clusters before setting up L2 entries
77            self.caches.l2_depends_on_rb().await?;
78
79            allocator.allocate_clusters(ClusterCount(1), None).await
80        }
81    }
82
83    /// Allocate the data cluster with the given index.
84    ///
85    /// Without a `mandatory_host_cluster` given, this is the same as
86    /// [`Qcow2::allocate_data_cluster()`].
87    ///
88    /// With a `mandatory_host_cluster` given, try to allocate that cluster.  If that is not
89    /// possible because it is already allocated, return `Ok(None)`.
90    pub(super) async fn allocate_data_cluster_at(
91        &self,
92        guest_cluster: GuestCluster,
93        mandatory_host_cluster: Option<HostCluster>,
94    ) -> io::Result<Option<HostCluster>> {
95        let Some(mandatory_host_cluster) = mandatory_host_cluster else {
96            return self.allocate_data_cluster(guest_cluster).await.map(Some);
97        };
98
99        if self.header.external_data_file() {
100            let cluster = HostCluster(guest_cluster.0);
101            Ok((cluster == mandatory_host_cluster).then_some(cluster))
102        } else {
103            let mut allocator = self.allocator().await?;
104
105            // Allocate clusters before setting up L2 entries
106            self.caches.l2_depends_on_rb().await?;
107
108            let cluster = allocator
109                .allocate_cluster_at(mandatory_host_cluster)
110                .await?
111                .then_some(mandatory_host_cluster);
112            Ok(cluster)
113        }
114    }
115
116    /// Free metadata clusters (i.e. decrement their refcount).
117    ///
118    /// Best-effort operation.  On error, the given clusters may be leaked, but no errors are ever
119    /// returned (because there is no good way to handle such errors anyway).
120    pub(super) async fn free_meta_clusters(&self, cluster: HostCluster, count: ClusterCount) {
121        if let Ok(mut allocator) = self.allocator().await {
122            allocator.free_clusters(cluster, count).await
123        }
124    }
125
126    /// Free data clusters (i.e. decrement their refcount).
127    ///
128    /// Best-effort operation.  On error, the given clusters may be leaked, but no errors are ever
129    /// returned (because there is no good way to handle such errors anyway).
130    pub(super) async fn free_data_clusters(&self, cluster: HostCluster, count: ClusterCount) {
131        if !self.header.external_data_file() {
132            if let Ok(mut allocator) = self.allocator().await {
133                // Clear L2 entries before deallocating cluster
134                if let Err(err) = self.caches.rb_depends_on_l2().await {
135                    warn!("Leaking clusters; cannot set up cache dependency: {err}");
136                    return;
137                }
138
139                allocator.free_clusters(cluster, count).await;
140            }
141        }
142    }
143}
144
145impl<S: Storage> Allocator<S> {
146    /// Create a new allocator for the given image file.
147    pub async fn new(
148        image: Arc<S>,
149        header: Arc<Header>,
150        caches: Arc<MetadataCaches<S>>,
151    ) -> io::Result<Self> {
152        let cb = header.cluster_bits();
153        let rt_offset = header.reftable_offset();
154        let rt_cluster = rt_offset
155            .checked_cluster(cb)
156            .ok_or_else(|| invalid_data(format!("Unaligned refcount table: {rt_offset}")))?;
157
158        let reftable = RefTable::load(
159            image.as_ref(),
160            &header,
161            rt_cluster,
162            header.reftable_entries(),
163        )
164        .await?;
165
166        Ok(Allocator {
167            file: image,
168            reftable,
169            first_free_cluster: HostCluster(0),
170            header,
171            caches,
172        })
173    }
174
175    /// Invaidate the refcount block cache.
176    ///
177    /// # Safety
178    /// May cause image corruption, you must guarantee the on-disk state is consistent.
179    pub async unsafe fn invalidate_rb_cache(&self) -> io::Result<()> {
180        // Safe: Caller says so.
181        unsafe { self.caches.invalidate_rb() }.await
182    }
183
184    /// Allocate clusters in the image file.
185    ///
186    /// `end_cluster` should only be used when allocating refblocks.  When reaching this cluster
187    /// index, abort trying to allocate.  (This is used for allocating refblocks, to prevent
188    /// infinite recursion and speed things up.)
189    async fn allocate_clusters(
190        &mut self,
191        count: ClusterCount,
192        end_cluster: Option<HostCluster>,
193    ) -> io::Result<HostCluster> {
194        let mut index = self.first_free_cluster;
195        loop {
196            if end_cluster == Some(index) {
197                return Err(io::Error::other("Maximum cluster index reached"));
198            }
199
200            let alloc_count = self.allocate_clusters_at(index, count).await?;
201            if alloc_count == count {
202                return Ok(index);
203            }
204
205            index += alloc_count + ClusterCount(1);
206            if index.offset(self.header.cluster_bits()) > MAX_OFFSET {
207                return Err(io::Error::other("Cannot grow qcow2 file any further"));
208            }
209        }
210    }
211
212    /// Allocate the given clusters in the image file.
213    ///
214    /// Allocate up to `count` unallocated clusters starting from `index`.  When encountering an
215    /// already allocated cluster (or any other error), stop, and free the clusters that were just
216    /// newly allocated.
217    ///
218    /// Returns the number of clusters that could be allocated (starting from `index`), which may
219    /// be 0 if `index` has already been allocated.  Note again that in case this is less than
220    /// `count`, those clusters will have been freed again already, so this is just a hint to
221    /// callers that the cluster at `index + count` is already allocated.
222    async fn allocate_clusters_at(
223        &mut self,
224        mut index: HostCluster,
225        mut count: ClusterCount,
226    ) -> io::Result<ClusterCount> {
227        let start_index = index;
228
229        while count > ClusterCount(0) {
230            // Note that `ensure_rb()` in `allocate_cluster_at()` may allocate clusters (new
231            // refblocks), and also a new refcount table.  This can interfere with us allocating a
232            // large continuous region like so (A is our allocation, R is a refblock, imagine a
233            // refblock covers four clusters):
234            //
235            // |AAAA| -- allocated four clusters need new refblock
236            // |AAAA|R   | -- made refblock self-describing, but now allocation cannot go on
237            //
238            // This gets resolved by us retrying, and future refblocks using the region that has
239            // now become free but already has refblocks to cover it:
240            //
241            // |    |RAAA| -- retry after refblock; need a new refblock again
242            // |R   |RAAA|AAAA| -- the new refblock allocates itself in the region we abandoned
243            //
244            // However, eventually, the new refblocks will run into the new start of our allocation
245            // again:
246            //
247            // |RRRR|RAAA|AAAA|AAAA|AAAA|AAAA| -- need new refblock
248            // |RRRR|RAAA|AAAA|AAAA|AAAA|AAAA|R   | -- allocation cannot go on, again
249            // |RRRR|R   |    |    |    |    |RAAA| -- another attempt
250            // |RRRR|RRRR|R...|    |    |    |RAAA|AAAA|AAAA|AAAA|AAAA|...
251            //
252            // As you can see, the hole we leave behind gets larger each time.  So eventually, this
253            // must converge.
254            //
255            // The same applies to the refcount table being allocated instead of just refblocks.
256
257            let result = self.allocate_cluster_at(index).await;
258            if !matches!(result, Ok(true)) {
259                // Already allocated, or some real error occurred; free everything allocated so far
260                self.free_clusters(start_index, index - start_index).await;
261                return result.map(|_| index - start_index);
262            }
263
264            count -= ClusterCount(1);
265            index += ClusterCount(1);
266        }
267
268        Ok(index - start_index)
269    }
270
271    /// Allocate the given cluster in the image file.
272    ///
273    /// Return `Ok(true)` if allocation was successful, or `Ok(false)` if the cluster was already
274    /// allocated before.
275    async fn allocate_cluster_at(&mut self, index: HostCluster) -> io::Result<bool> {
276        let rb_bits = self.header.rb_bits();
277        let (rt_index, rb_index) = index.rt_rb_indices(rb_bits);
278
279        let rb = self.ensure_rb(rt_index).await?;
280        let mut rb = rb.lock_write().await;
281        let can_allocate = rb.is_zero(rb_index);
282        if can_allocate {
283            rb.increment(rb_index)?;
284        }
285
286        // We now know this is allocated
287        if index == self.first_free_cluster {
288            self.first_free_cluster = index + ClusterCount(1);
289        }
290
291        Ok(can_allocate)
292    }
293
294    /// Get the refblock referenced by the given reftable index, if any.
295    ///
296    /// If there is no refblock for the given reftable index, return `Ok(None)`.
297    async fn get_rb(&mut self, rt_index: usize) -> io::Result<Option<Arc<RefBlock>>> {
298        let rt_entry = self.reftable.get(rt_index);
299        if let Some(rb_offset) = rt_entry.refblock_offset() {
300            let cb = self.header.cluster_bits();
301            let rb_cluster = rb_offset.checked_cluster(cb).ok_or_else(|| {
302                invalid_data(format!("Unaligned refcount block with index {rt_index}; refcount table entry: {rt_entry:?}"))
303            })?;
304
305            self.caches.rb_get_or_insert(rb_cluster).await.map(Some)
306        } else {
307            Ok(None)
308        }
309    }
310
311    /// Get a refblock for the given reftable index.
312    ///
313    /// If there already is a refblock at that index, return it.  Otherwise, create one and hook it
314    /// up.
315    async fn ensure_rb(&mut self, rt_index: usize) -> io::Result<Arc<RefBlock>> {
316        if let Some(rb) = self.get_rb(rt_index).await? {
317            return Ok(rb);
318        }
319
320        if !self.reftable.in_bounds(rt_index) {
321            self.grow_reftable(rt_index).await?;
322            // `grow_reftable` will allocate new refblocks, so check the index again
323            if let Some(rb) = self.get_rb(rt_index).await? {
324                return Ok(rb);
325            }
326        }
327
328        let mut new_rb = RefBlock::new_cleared(self.file.as_ref(), &self.header)?;
329
330        // This is the first cluster covered by the new refblock
331        let rb_cluster = HostCluster::from_ref_indices(rt_index, 0, self.header.rb_bits());
332
333        // Try to allocate a cluster in the already existing refcount structures.
334        // By stopping looking for clusters at `rb_cluster`, we ensure that we will not land here
335        // in this exact function again, trying to allocate the very same refblock (it is possible
336        // we allocate one before the current one, though), and so prevent any possible infinite
337        // recursion.
338        // Recursion is possible, though, so the future must be boxed.
339        // false`), so must be boxed.
340        if let Ok(new_rb_cluster) =
341            Box::pin(self.allocate_clusters(ClusterCount(1), Some(rb_cluster))).await
342        {
343            new_rb.set_cluster(new_rb_cluster);
344        } else {
345            // Place the refblock such that it covers itself
346            new_rb.set_cluster(rb_cluster);
347            new_rb.lock_write().await.increment(0)?;
348        }
349        new_rb.write(self.file.as_ref()).await?;
350
351        self.reftable.enter_refblock(rt_index, &new_rb)?;
352        self.reftable
353            .write_entry(self.file.as_ref(), rt_index)
354            .await?;
355
356        let new_rb = Arc::new(new_rb);
357        self.caches
358            .rb_insert(new_rb.get_cluster().unwrap(), Arc::clone(&new_rb))
359            .await?;
360        Ok(new_rb)
361    }
362
363    /// Create a new refcount table covering at least `at_least_index`.
364    ///
365    /// Create a new reftable of the required size, copy all existing refblock references into it,
366    /// ensure it is refcounted itself (also creating new refblocks if necessary), and have the
367    /// image header reference the new refcount table.
368    async fn grow_reftable(&mut self, at_least_index: usize) -> io::Result<()> {
369        let cb = self.header.cluster_bits();
370        let rb_bits = self.header.rb_bits();
371        let rb_entries = 1 << rb_bits;
372
373        let mut new_rt = self.reftable.clone_and_grow(&self.header, at_least_index)?;
374        let rt_clusters = ClusterCount::from_byte_size(new_rt.byte_size() as u64, cb);
375
376        // Find free range
377        let (mut rt_index, mut rb_index) = self.first_free_cluster.rt_rb_indices(rb_bits);
378        let mut free_cluster_index: Option<HostCluster> = None;
379        let mut free_cluster_count = ClusterCount(0);
380
381        // Number of clusters required to allocate both the new reftable and all new refblocks.
382        // Note that `clone_and_grow()` *guarantees* we can fit the final count in there.
383        let mut required_clusters = rt_clusters;
384
385        while free_cluster_count < required_clusters {
386            // `clone_and_grow()` guarantees it can fit
387            assert!(new_rt.in_bounds(rt_index));
388
389            let rt_entry = new_rt.get(rt_index);
390            let Some(rb_offset) = rt_entry.refblock_offset() else {
391                let start_index = HostCluster::from_ref_indices(rt_index, 0, rb_bits);
392                free_cluster_index.get_or_insert(start_index);
393                free_cluster_count += ClusterCount(rb_entries as u64);
394                // Need to allocate this RB
395                required_clusters += ClusterCount(1);
396                continue;
397            };
398
399            let rb_cluster = rb_offset.checked_cluster(cb).ok_or_else(|| {
400                invalid_data(format!("Unaligned refcount block with index {rt_index}; refcount table entry: {rt_entry:?}"))
401            })?;
402
403            let rb = self.caches.rb_get_or_insert(rb_cluster).await?;
404            for i in rb_index..rb_entries {
405                if rb.is_zero(i) {
406                    let index = HostCluster::from_ref_indices(rt_index, i, rb_bits);
407                    free_cluster_index.get_or_insert(index);
408                    free_cluster_count += ClusterCount(1);
409
410                    if free_cluster_count >= required_clusters {
411                        break;
412                    }
413                } else if free_cluster_index.is_some() {
414                    free_cluster_index.take();
415                    free_cluster_count = ClusterCount(0);
416                    required_clusters = rt_clusters; // reset
417                }
418            }
419
420            rb_index = 0;
421            rt_index += 1;
422        }
423
424        let mut index = free_cluster_index.unwrap();
425        let mut count = required_clusters;
426
427        // Put refblocks first
428        let rt_index_start = index.rt_index(rb_bits);
429        let rt_index_end = (index + count).0.div_ceil(rb_entries as u64) as usize;
430
431        let mut refblocks = Vec::<Arc<RefBlock>>::new();
432        for rt_i in rt_index_start..rt_index_end {
433            if let Some(rb_offset) = new_rt.get(rt_i).refblock_offset() {
434                // Checked in the loop above
435                let rb_cluster = rb_offset.checked_cluster(cb).unwrap();
436                let rb = self.caches.rb_get_or_insert(rb_cluster).await?;
437                refblocks.push(rb);
438                continue;
439            }
440
441            let mut rb = RefBlock::new_cleared(self.file.as_ref(), &self.header)?;
442            rb.set_cluster(index);
443            new_rt.enter_refblock(rt_i, &rb)?;
444            let rb = Arc::new(rb);
445            self.caches.rb_insert(index, Arc::clone(&rb)).await?;
446            refblocks.push(rb);
447            index += ClusterCount(1);
448            count -= ClusterCount(1);
449        }
450
451        assert!(count >= rt_clusters);
452        new_rt.set_cluster(index);
453
454        // Now set allocation information
455        let start_index = free_cluster_index.unwrap();
456        let end_index = index + rt_clusters;
457
458        for index in start_index.0..end_index.0 {
459            let index = HostCluster(index);
460            let (rt_i, rb_i) = index.rt_rb_indices(rb_bits);
461
462            // `refblocks[0]` is for `rt_index_start`
463            let rb_vec_i = rt_i - rt_index_start;
464            // Incrementing from 0 to 1 must succeed
465            refblocks[rb_vec_i]
466                .lock_write()
467                .await
468                .increment(rb_i)
469                .unwrap();
470        }
471
472        // Any errors from here on may lead to leaked clusters if there are refblocks in
473        // `refblocks` that are already part of the old reftable.
474        // TODO: Try to clean that up, though it seems quite hard for little gain.
475        self.caches.flush_rb().await?;
476        new_rt.write(self.file.as_ref()).await?;
477
478        self.header.set_reftable(&new_rt)?;
479        self.header
480            .write_reftable_pointer(self.file.as_ref())
481            .await?;
482
483        // Must set new reftable before calling `free_clusters()`
484        let mut old_reftable = mem::replace(&mut self.reftable, new_rt);
485        if let Some(old_rt_cluster) = old_reftable.get_cluster() {
486            let old_rt_size = old_reftable.cluster_count();
487            old_reftable.unset_cluster();
488            self.free_clusters(old_rt_cluster, old_rt_size).await;
489        }
490
491        Ok(())
492    }
493
494    /// Free clusters (i.e. decrement their refcount).
495    ///
496    /// Best-effort operation.  On error, the given clusters may be leaked, but no errors are ever
497    /// returned (because there is no good way to handle such errors anyway).
498    async fn free_clusters(&mut self, start: HostCluster, mut count: ClusterCount) {
499        if count.0 == 0 {
500            return;
501        }
502
503        if start < self.first_free_cluster {
504            self.first_free_cluster = start;
505        }
506
507        let rb_bits = self.header.rb_bits();
508        let rb_entries = 1 << rb_bits;
509        let (mut rt_index, mut rb_index) = start.rt_rb_indices(rb_bits);
510
511        while count > ClusterCount(0) {
512            let in_rb_count = cmp::min((rb_entries - rb_index) as u64, count.0) as usize;
513
514            match self.get_rb(rt_index).await {
515                Ok(Some(rb)) => {
516                    let mut rb = rb.lock_write().await;
517                    for i in rb_index..(rb_index + in_rb_count) {
518                        if let Err(err) = rb.decrement(i) {
519                            event!(Level::WARN, "Failed to free cluster: {err}");
520                        }
521                    }
522                }
523
524                Ok(None) => {
525                    event!(
526                        Level::WARN,
527                        "Failed to free {in_rb_count} clusters: Not allocated"
528                    )
529                }
530                Err(err) => event!(Level::WARN, "Failed to free {in_rb_count} clusters: {err}"),
531            }
532
533            count -= ClusterCount(in_rb_count as u64);
534            rb_index = 0;
535            rt_index += 1;
536        }
537    }
538}