imago/qcow2/
allocation.rs

1//! Cluster allocation.
2//!
3//! Functionality for allocating single clusters and ranges of clusters, and general handling of
4//! refcount structures.
5
6use super::cache::RefBlockCacheBackend;
7use super::*;
8use std::mem;
9use tokio::sync::MutexGuard;
10use tracing::{event, warn, Level};
11
12/// Central facility for cluster allocation.
13pub(super) struct Allocator<S: Storage> {
14    /// Qcow2 metadata file.
15    file: Arc<S>,
16
17    /// Qcow2 refcount table.
18    reftable: RefTable,
19
20    /// The first free cluster index in the qcow2 file, to speed up allocation.
21    first_free_cluster: HostCluster,
22
23    /// Qcow2 image header.
24    header: Arc<Header>,
25
26    /// Refblock cache.
27    rb_cache: AsyncLruCache<HostCluster, RefBlock, RefBlockCacheBackend<S>>,
28}
29
30impl<S: Storage + 'static, F: WrappedFormat<S> + 'static> Qcow2<S, F> {
31    /// Return the central allocator instance.
32    ///
33    /// Returns an error for read-only images.
34    async fn allocator(&self) -> io::Result<MutexGuard<'_, Allocator<S>>> {
35        Ok(self
36            .allocator
37            .as_ref()
38            .ok_or_else(|| io::Error::other("Image is read-only"))?
39            .lock()
40            .await)
41    }
42
43    /// Allocate one metadata cluster.
44    ///
45    /// Metadata clusters are allocated exclusively in the metadata (image) file.
46    pub(super) async fn allocate_meta_cluster(&self) -> io::Result<HostCluster> {
47        self.allocate_meta_clusters(ClusterCount(1)).await
48    }
49
50    /// Allocate multiple continuous metadata clusters.
51    ///
52    /// Useful e.g. for the L1 table or refcount table.
53    pub(super) async fn allocate_meta_clusters(
54        &self,
55        count: ClusterCount,
56    ) -> io::Result<HostCluster> {
57        self.allocator().await?.allocate_clusters(count, None).await
58    }
59
60    /// Allocate one data clusters for the given guest cluster.
61    ///
62    /// Without an external data file, data clusters are allocated in the image file, just like
63    /// metadata clusters.
64    ///
65    /// With an external data file, data clusters aren’t really allocated, but just put there at
66    /// the same offset as their guest offset.  Their refcount is not tracked by the qcow2 metadata
67    /// structures (which only cover the metadata (image) file).
68    pub(super) async fn allocate_data_cluster(
69        &self,
70        guest_cluster: GuestCluster,
71    ) -> io::Result<HostCluster> {
72        if self.header.external_data_file() {
73            Ok(HostCluster(guest_cluster.0))
74        } else {
75            let mut allocator = self.allocator().await?;
76
77            // Allocate clusters before setting up L2 entries
78            self.l2_cache.depend_on(&allocator.rb_cache).await?;
79
80            allocator.allocate_clusters(ClusterCount(1), None).await
81        }
82    }
83
84    /// Allocate the data cluster with the given index.
85    ///
86    /// Without a `mandatory_host_cluster` given, this is the same as
87    /// [`Qcow2::allocate_data_cluster()`].
88    ///
89    /// With a `mandatory_host_cluster` given, try to allocate that cluster.  If that is not
90    /// possible because it is already allocated, return `Ok(None)`.
91    pub(super) async fn allocate_data_cluster_at(
92        &self,
93        guest_cluster: GuestCluster,
94        mandatory_host_cluster: Option<HostCluster>,
95    ) -> io::Result<Option<HostCluster>> {
96        let Some(mandatory_host_cluster) = mandatory_host_cluster else {
97            return self.allocate_data_cluster(guest_cluster).await.map(Some);
98        };
99
100        if self.header.external_data_file() {
101            let cluster = HostCluster(guest_cluster.0);
102            Ok((cluster == mandatory_host_cluster).then_some(cluster))
103        } else {
104            let mut allocator = self.allocator().await?;
105
106            // Allocate clusters before setting up L2 entries
107            self.l2_cache.depend_on(&allocator.rb_cache).await?;
108
109            let cluster = allocator
110                .allocate_cluster_at(mandatory_host_cluster)
111                .await?
112                .then_some(mandatory_host_cluster);
113            Ok(cluster)
114        }
115    }
116
117    /// Free metadata clusters (i.e. decrement their refcount).
118    ///
119    /// Best-effort operation.  On error, the given clusters may be leaked, but no errors are ever
120    /// returned (because there is no good way to handle such errors anyway).
121    pub(super) async fn free_meta_clusters(&self, cluster: HostCluster, count: ClusterCount) {
122        if let Ok(mut allocator) = self.allocator().await {
123            allocator.free_clusters(cluster, count).await
124        }
125    }
126
127    /// Free data clusters (i.e. decrement their refcount).
128    ///
129    /// Best-effort operation.  On error, the given clusters may be leaked, but no errors are ever
130    /// returned (because there is no good way to handle such errors anyway).
131    pub(super) async fn free_data_clusters(&self, cluster: HostCluster, count: ClusterCount) {
132        if !self.header.external_data_file() {
133            if let Ok(mut allocator) = self.allocator().await {
134                // Clear L2 entries before deallocating clusters
135                if let Err(err) = allocator.rb_cache.depend_on(&self.l2_cache).await {
136                    warn!("Leaking clusters; cannot set up cache inter-dependency with L2 cache: {err}");
137                    return;
138                }
139
140                allocator.free_clusters(cluster, count).await;
141            }
142        }
143    }
144}
145
146impl<S: Storage> Allocator<S> {
147    /// Create a new allocator for the given image file.
148    pub async fn new(image: Arc<S>, header: Arc<Header>) -> io::Result<Self> {
149        let cb = header.cluster_bits();
150        let rt_offset = header.reftable_offset();
151        let rt_cluster = rt_offset
152            .checked_cluster(cb)
153            .ok_or_else(|| invalid_data(format!("Unaligned refcount table: {rt_offset}")))?;
154
155        let reftable = RefTable::load(
156            image.as_ref(),
157            &header,
158            rt_cluster,
159            header.reftable_entries(),
160        )
161        .await?;
162
163        let rb_cache_backend = RefBlockCacheBackend::new(Arc::clone(&image), Arc::clone(&header));
164        let rb_cache = AsyncLruCache::new(rb_cache_backend, 32);
165
166        Ok(Allocator {
167            file: image,
168            reftable,
169            first_free_cluster: HostCluster(0),
170            header,
171            rb_cache,
172        })
173    }
174
175    /// Flush the refcount block cache.
176    pub async fn flush_rb_cache(&self) -> io::Result<()> {
177        self.rb_cache.flush().await
178    }
179
180    /// Invaidate the refcount block cache.
181    ///
182    /// # Safety
183    /// May cause image corruption, you must guarantee the on-disk state is consistent.
184    pub async unsafe fn invalidate_rb_cache(&self) -> io::Result<()> {
185        // Safe: Caller says so.
186        unsafe { self.rb_cache.invalidate() }.await
187    }
188
189    /// Allocate clusters in the image file.
190    ///
191    /// `end_cluster` should only be used when allocating refblocks.  When reaching this cluster
192    /// index, abort trying to allocate.  (This is used for allocating refblocks, to prevent
193    /// infinite recursion and speed things up.)
194    async fn allocate_clusters(
195        &mut self,
196        count: ClusterCount,
197        end_cluster: Option<HostCluster>,
198    ) -> io::Result<HostCluster> {
199        let mut index = self.first_free_cluster;
200        loop {
201            if end_cluster == Some(index) {
202                return Err(io::Error::other("Maximum cluster index reached"));
203            }
204
205            let alloc_count = self.allocate_clusters_at(index, count).await?;
206            if alloc_count == count {
207                return Ok(index);
208            }
209
210            index += alloc_count + ClusterCount(1);
211            if index.offset(self.header.cluster_bits()) > MAX_OFFSET {
212                return Err(io::Error::other("Cannot grow qcow2 file any further"));
213            }
214        }
215    }
216
217    /// Allocate the given clusters in the image file.
218    ///
219    /// Allocate up to `count` unallocated clusters starting from `index`.  When encountering an
220    /// already allocated cluster (or any other error), stop, and free the clusters that were just
221    /// newly allocated.
222    ///
223    /// Returns the number of clusters that could be allocated (starting from `index`), which may
224    /// be 0 if `index` has already been allocated.  Note again that in case this is less than
225    /// `count`, those clusters will have been freed again already, so this is just a hint to
226    /// callers that the cluster at `index + count` is already allocated.
227    async fn allocate_clusters_at(
228        &mut self,
229        mut index: HostCluster,
230        mut count: ClusterCount,
231    ) -> io::Result<ClusterCount> {
232        let start_index = index;
233
234        while count > ClusterCount(0) {
235            // Note that `ensure_rb()` in `allocate_cluster_at()` may allocate clusters (new
236            // refblocks), and also a new refcount table.  This can interfere with us allocating a
237            // large continuous region like so (A is our allocation, R is a refblock, imagine a
238            // refblock covers four clusters):
239            //
240            // |AAAA| -- allocated four clusters need new refblock
241            // |AAAA|R   | -- made refblock self-describing, but now allocation cannot go on
242            //
243            // This gets resolved by us retrying, and future refblocks using the region that has
244            // now become free but already has refblocks to cover it:
245            //
246            // |    |RAAA| -- retry after refblock; need a new refblock again
247            // |R   |RAAA|AAAA| -- the new refblock allocates itself in the region we abandoned
248            //
249            // However, eventually, the new refblocks will run into the new start of our allocation
250            // again:
251            //
252            // |RRRR|RAAA|AAAA|AAAA|AAAA|AAAA| -- need new refblock
253            // |RRRR|RAAA|AAAA|AAAA|AAAA|AAAA|R   | -- allocation cannot go on, again
254            // |RRRR|R   |    |    |    |    |RAAA| -- another attempt
255            // |RRRR|RRRR|R...|    |    |    |RAAA|AAAA|AAAA|AAAA|AAAA|...
256            //
257            // As you can see, the hole we leave behind gets larger each time.  So eventually, this
258            // must converge.
259            //
260            // The same applies to the refcount table being allocated instead of just refblocks.
261
262            let result = self.allocate_cluster_at(index).await;
263            if !matches!(result, Ok(true)) {
264                // Already allocated, or some real error occurred; free everything allocated so far
265                self.free_clusters(start_index, index - start_index).await;
266                return result.map(|_| index - start_index);
267            }
268
269            count -= ClusterCount(1);
270            index += ClusterCount(1);
271        }
272
273        Ok(index - start_index)
274    }
275
276    /// Allocate the given cluster in the image file.
277    ///
278    /// Return `Ok(true)` if allocation was successful, or `Ok(false)` if the cluster was already
279    /// allocated before.
280    async fn allocate_cluster_at(&mut self, index: HostCluster) -> io::Result<bool> {
281        let rb_bits = self.header.rb_bits();
282        let (rt_index, rb_index) = index.rt_rb_indices(rb_bits);
283
284        let rb = self.ensure_rb(rt_index).await?;
285        let mut rb = rb.lock_write().await;
286        let can_allocate = rb.is_zero(rb_index);
287        if can_allocate {
288            rb.increment(rb_index)?;
289        }
290
291        // We now know this is allocated
292        if index == self.first_free_cluster {
293            self.first_free_cluster = index + ClusterCount(1);
294        }
295
296        Ok(can_allocate)
297    }
298
299    /// Get the refblock referenced by the given reftable index, if any.
300    ///
301    /// If there is no refblock for the given reftable index, return `Ok(None)`.
302    async fn get_rb(&mut self, rt_index: usize) -> io::Result<Option<Arc<RefBlock>>> {
303        let rt_entry = self.reftable.get(rt_index);
304        if let Some(rb_offset) = rt_entry.refblock_offset() {
305            let cb = self.header.cluster_bits();
306            let rb_cluster = rb_offset.checked_cluster(cb).ok_or_else(|| {
307                invalid_data(format!("Unaligned refcount block with index {rt_index}; refcount table entry: {rt_entry:?}"))
308            })?;
309
310            self.rb_cache.get_or_insert(rb_cluster).await.map(Some)
311        } else {
312            Ok(None)
313        }
314    }
315
316    /// Get a refblock for the given reftable index.
317    ///
318    /// If there already is a refblock at that index, return it.  Otherwise, create one and hook it
319    /// up.
320    async fn ensure_rb(&mut self, rt_index: usize) -> io::Result<Arc<RefBlock>> {
321        if let Some(rb) = self.get_rb(rt_index).await? {
322            return Ok(rb);
323        }
324
325        if !self.reftable.in_bounds(rt_index) {
326            self.grow_reftable(rt_index).await?;
327            // `grow_reftable` will allocate new refblocks, so check the index again
328            if let Some(rb) = self.get_rb(rt_index).await? {
329                return Ok(rb);
330            }
331        }
332
333        let mut new_rb = RefBlock::new_cleared(self.file.as_ref(), &self.header)?;
334
335        // This is the first cluster covered by the new refblock
336        let rb_cluster = HostCluster::from_ref_indices(rt_index, 0, self.header.rb_bits());
337
338        // Try to allocate a cluster in the already existing refcount structures.
339        // By stopping looking for clusters at `rb_cluster`, we ensure that we will not land here
340        // in this exact function again, trying to allocate the very same refblock (it is possible
341        // we allocate one before the current one, though), and so prevent any possible infinite
342        // recursion.
343        // Recursion is possible, though, so the future must be boxed.
344        // false`), so must be boxed.
345        if let Ok(new_rb_cluster) =
346            Box::pin(self.allocate_clusters(ClusterCount(1), Some(rb_cluster))).await
347        {
348            new_rb.set_cluster(new_rb_cluster);
349        } else {
350            // Place the refblock such that it covers itself
351            new_rb.set_cluster(rb_cluster);
352            new_rb.lock_write().await.increment(0)?;
353        }
354        new_rb.write(self.file.as_ref()).await?;
355
356        self.reftable.enter_refblock(rt_index, &new_rb)?;
357        self.reftable
358            .write_entry(self.file.as_ref(), rt_index)
359            .await?;
360
361        let new_rb = Arc::new(new_rb);
362        self.rb_cache
363            .insert(new_rb.get_cluster().unwrap(), Arc::clone(&new_rb))
364            .await?;
365        Ok(new_rb)
366    }
367
368    /// Create a new refcount table covering at least `at_least_index`.
369    ///
370    /// Create a new reftable of the required size, copy all existing refblock references into it,
371    /// ensure it is refcounted itself (also creating new refblocks if necessary), and have the
372    /// image header reference the new refcount table.
373    async fn grow_reftable(&mut self, at_least_index: usize) -> io::Result<()> {
374        let cb = self.header.cluster_bits();
375        let rb_bits = self.header.rb_bits();
376        let rb_entries = 1 << rb_bits;
377
378        let mut new_rt = self.reftable.clone_and_grow(&self.header, at_least_index)?;
379        let rt_clusters = ClusterCount::from_byte_size(new_rt.byte_size() as u64, cb);
380
381        // Find free range
382        let (mut rt_index, mut rb_index) = self.first_free_cluster.rt_rb_indices(rb_bits);
383        let mut free_cluster_index: Option<HostCluster> = None;
384        let mut free_cluster_count = ClusterCount(0);
385
386        // Number of clusters required to allocate both the new reftable and all new refblocks.
387        // Note that `clone_and_grow()` *guarantees* we can fit the final count in there.
388        let mut required_clusters = rt_clusters;
389
390        while free_cluster_count < required_clusters {
391            // `clone_and_grow()` guarantees it can fit
392            assert!(new_rt.in_bounds(rt_index));
393
394            let rt_entry = new_rt.get(rt_index);
395            let Some(rb_offset) = rt_entry.refblock_offset() else {
396                let start_index = HostCluster::from_ref_indices(rt_index, 0, rb_bits);
397                free_cluster_index.get_or_insert(start_index);
398                free_cluster_count += ClusterCount(rb_entries as u64);
399                // Need to allocate this RB
400                required_clusters += ClusterCount(1);
401                continue;
402            };
403
404            let rb_cluster = rb_offset.checked_cluster(cb).ok_or_else(|| {
405                invalid_data(format!("Unaligned refcount block with index {rt_index}; refcount table entry: {rt_entry:?}"))
406            })?;
407
408            let rb = self.rb_cache.get_or_insert(rb_cluster).await?;
409            for i in rb_index..rb_entries {
410                if rb.is_zero(i) {
411                    let index = HostCluster::from_ref_indices(rt_index, i, rb_bits);
412                    free_cluster_index.get_or_insert(index);
413                    free_cluster_count += ClusterCount(1);
414
415                    if free_cluster_count >= required_clusters {
416                        break;
417                    }
418                } else if free_cluster_index.is_some() {
419                    free_cluster_index.take();
420                    free_cluster_count = ClusterCount(0);
421                    required_clusters = rt_clusters; // reset
422                }
423            }
424
425            rb_index = 0;
426            rt_index += 1;
427        }
428
429        let mut index = free_cluster_index.unwrap();
430        let mut count = required_clusters;
431
432        // Put refblocks first
433        let rt_index_start = index.rt_index(rb_bits);
434        let rt_index_end = (index + count).0.div_ceil(rb_entries as u64) as usize;
435
436        let mut refblocks = Vec::<Arc<RefBlock>>::new();
437        for rt_i in rt_index_start..rt_index_end {
438            if let Some(rb_offset) = new_rt.get(rt_i).refblock_offset() {
439                // Checked in the loop above
440                let rb_cluster = rb_offset.checked_cluster(cb).unwrap();
441                let rb = self.rb_cache.get_or_insert(rb_cluster).await?;
442                refblocks.push(rb);
443                continue;
444            }
445
446            let mut rb = RefBlock::new_cleared(self.file.as_ref(), &self.header)?;
447            rb.set_cluster(index);
448            new_rt.enter_refblock(rt_i, &rb)?;
449            let rb = Arc::new(rb);
450            self.rb_cache.insert(index, Arc::clone(&rb)).await?;
451            refblocks.push(rb);
452            index += ClusterCount(1);
453            count -= ClusterCount(1);
454        }
455
456        assert!(count >= rt_clusters);
457        new_rt.set_cluster(index);
458
459        // Now set allocation information
460        let start_index = free_cluster_index.unwrap();
461        let end_index = index + rt_clusters;
462
463        for index in start_index.0..end_index.0 {
464            let index = HostCluster(index);
465            let (rt_i, rb_i) = index.rt_rb_indices(rb_bits);
466
467            // `refblocks[0]` is for `rt_index_start`
468            let rb_vec_i = rt_i - rt_index_start;
469            // Incrementing from 0 to 1 must succeed
470            refblocks[rb_vec_i]
471                .lock_write()
472                .await
473                .increment(rb_i)
474                .unwrap();
475        }
476
477        // Any errors from here on may lead to leaked clusters if there are refblocks in
478        // `refblocks` that are already part of the old reftable.
479        // TODO: Try to clean that up, though it seems quite hard for little gain.
480        self.rb_cache.flush().await?;
481        new_rt.write(self.file.as_ref()).await?;
482
483        self.header.set_reftable(&new_rt)?;
484        self.header
485            .write_reftable_pointer(self.file.as_ref())
486            .await?;
487
488        // Must set new reftable before calling `free_clusters()`
489        let mut old_reftable = mem::replace(&mut self.reftable, new_rt);
490        if let Some(old_rt_cluster) = old_reftable.get_cluster() {
491            let old_rt_size = old_reftable.cluster_count();
492            old_reftable.unset_cluster();
493            self.free_clusters(old_rt_cluster, old_rt_size).await;
494        }
495
496        Ok(())
497    }
498
499    /// Free clusters (i.e. decrement their refcount).
500    ///
501    /// Best-effort operation.  On error, the given clusters may be leaked, but no errors are ever
502    /// returned (because there is no good way to handle such errors anyway).
503    async fn free_clusters(&mut self, start: HostCluster, mut count: ClusterCount) {
504        if count.0 == 0 {
505            return;
506        }
507
508        if start < self.first_free_cluster {
509            self.first_free_cluster = start;
510        }
511
512        let rb_bits = self.header.rb_bits();
513        let rb_entries = 1 << rb_bits;
514        let (mut rt_index, mut rb_index) = start.rt_rb_indices(rb_bits);
515
516        while count > ClusterCount(0) {
517            let in_rb_count = cmp::min((rb_entries - rb_index) as u64, count.0) as usize;
518
519            match self.get_rb(rt_index).await {
520                Ok(Some(rb)) => {
521                    let mut rb = rb.lock_write().await;
522                    for i in rb_index..(rb_index + in_rb_count) {
523                        if let Err(err) = rb.decrement(i) {
524                            event!(Level::WARN, "Failed to free cluster: {err}");
525                        }
526                    }
527                }
528
529                Ok(None) => {
530                    event!(
531                        Level::WARN,
532                        "Failed to free {in_rb_count} clusters: Not allocated"
533                    )
534                }
535                Err(err) => event!(Level::WARN, "Failed to free {in_rb_count} clusters: {err}"),
536            }
537
538            count -= ClusterCount(in_rb_count as u64);
539            rb_index = 0;
540            rt_index += 1;
541        }
542    }
543}