Patchwork D6257: rust-discovery: core implementation for take_quick_sample()

login
register
mail settings
Submitter phabricator
Date April 16, 2019, 9:51 p.m.
Message ID <differential-rev-PHID-DREV-2kmcy2fevy6vh4qsv2jk-req@phab.mercurial-scm.org>
Download mbox | patch
Permalink /patch/39668/
State Superseded
Headers show

Comments

phabricator - April 16, 2019, 9:51 p.m.
gracinet created this revision.
Herald added subscribers: mercurial-devel, kevincox, durin42.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  This makes in particular `rand` no longer a testing dependency.
  
  We keep a seedable random generator on the `PartialDiscovery` object
  itself, to avoid lengthy initialization.
  
  In take_quick_sample() itself, we had to avoid keeping the reference
  to `self.undecided` to cope with the mutable reference introduced
  by the the call to `limit_sample`, but it's still manageable without
  resorting to inner mutability.
  
  Sampling being prone to be improved in the mid-term future, testing
  is minimal, amounting to checking which code path got executed.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6257

AFFECTED FILES
  rust/hg-core/Cargo.toml
  rust/hg-core/src/discovery.rs

CHANGE DETAILS




To: gracinet, #hg-reviewers
Cc: durin42, kevincox, mercurial-devel
phabricator - April 20, 2019, 6:52 p.m.
kevincox accepted this revision.
kevincox added inline comments.

INLINE COMMENTS

> discovery.rs:70
> +        };
> +        if seen.contains(&current) {
> +            continue;

You can do

  if see.insert(current).is_some() {
    continue; // Already present.
  }

And avoid the insert below. Alternatively use the entry API which may be a bit more readable.

> discovery.rs:87
> +        seen.insert(current);
> +        for p in parentsfn(current)?.iter().cloned() {
> +            if p == NULL_REVISION {

You can pattern match away the reference since `Revision` is `Copy`.

  for &p in parentsfn(curent)? {
    // ...
  }

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6257

To: gracinet, #hg-reviewers, kevincox
Cc: durin42, kevincox, mercurial-devel
phabricator - May 6, 2019, 3:32 p.m.
gracinet added a comment.


  @kevincox thanks for the initial review, I think I've implemented the corrections you suggested.

INLINE COMMENTS

> kevincox wrote in discovery.rs:70
> You can do
> 
>   if see.insert(current).is_some() {
>     continue; // Already present.
>   }
> 
> And avoid the insert below. Alternatively use the entry API which may be a bit more readable.

Ah yes that's right, let this one through.

Since `seen` is not used in the loop after that check (there's an early return that doesn't matter in this case), I found the boolean return of `HashSet::insert()` does the job neatly.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6257

To: gracinet, #hg-reviewers, kevincox
Cc: durin42, kevincox, mercurial-devel

Patch

diff --git a/rust/hg-core/src/discovery.rs b/rust/hg-core/src/discovery.rs
--- a/rust/hg-core/src/discovery.rs
+++ b/rust/hg-core/src/discovery.rs
@@ -10,23 +10,96 @@ 
 //! This is a Rust counterpart to the `partialdiscovery` class of
 //! `mercurial.setdiscovery`
 
-use super::{Graph, GraphError, Revision};
+extern crate rand;
+extern crate rand_pcg;
+use self::rand::seq::SliceRandom;
+use self::rand::{thread_rng, RngCore, SeedableRng};
+use super::{Graph, GraphError, Revision, NULL_REVISION};
 use crate::ancestors::MissingAncestors;
 use crate::dagops;
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet, VecDeque};
+
+type Rng = self::rand_pcg::Pcg32;
 
 pub struct PartialDiscovery<G: Graph + Clone> {
     target_heads: Option<Vec<Revision>>,
     graph: G, // plays the role of self._repo
     common: MissingAncestors<G>,
     undecided: Option<HashSet<Revision>>,
     missing: HashSet<Revision>,
+    rng: Rng,
 }
 
 pub struct DiscoveryStats {
     pub undecided: Option<usize>,
 }
 
+/// Update an existing sample to match the expected size
+///
+/// The sample is updated with revisions exponentially distant from each
+/// element of `heads`.
+///
+/// If a target size is specified, the sampling will stop once this size is
+/// reached. Otherwise sampling will happen until roots of the <revs> set are
+/// reached.
+///
+/// - `revs`: set of revs we want to discover (if None, `assume` the whole dag
+///   represented by `parentfn`
+/// - `heads`: set of DAG head revs
+/// - `sample`: a sample to update
+/// - `parentfn`: a callable to resolve parents for a revision
+/// - `quicksamplesize`: optional target size of the sample
+fn update_sample(
+    revs: Option<&HashSet<Revision>>,
+    heads: impl IntoIterator<Item = Revision>,
+    sample: &mut HashSet<Revision>,
+    parentsfn: impl Fn(Revision) -> Result<[Revision; 2], GraphError>,
+    quicksamplesize: Option<usize>,
+) -> Result<(), GraphError> {
+    let mut distances: HashMap<Revision, u32> = HashMap::new();
+    let mut visit: VecDeque<Revision> = heads.into_iter().collect();
+    let mut factor: u32 = 1;
+    let mut seen: HashSet<Revision> = HashSet::new();
+    loop {
+        let current = match visit.pop_front() {
+            None => {
+                break;
+            }
+            Some(r) => r,
+        };
+        if seen.contains(&current) {
+            continue;
+        }
+
+        let d = *distances.entry(current).or_insert(1);
+        if d > factor {
+            factor *= 2;
+        }
+        if d == factor {
+            sample.insert(current);
+            if let Some(sz) = quicksamplesize {
+                if sample.len() >= sz {
+                    return Ok(());
+                }
+            }
+        }
+        seen.insert(current);
+        for p in parentsfn(current)?.iter().cloned() {
+            if p == NULL_REVISION {
+                continue;
+            }
+            if let Some(revs) = revs {
+                if !revs.contains(&p) {
+                    continue;
+                }
+            }
+            distances.entry(p).or_insert(d + 1);
+            visit.push_back(p);
+        }
+    }
+    Ok(())
+}
+
 impl<G: Graph + Clone> PartialDiscovery<G> {
     /// Create a PartialDiscovery object, with the intent
     /// of comparing our `::<target_heads>` revset to the contents of another
@@ -39,15 +112,47 @@ 
     /// we'll have to make it a type argument of `PartialDiscovery` or a trait
     /// object since we'll keep it in the meanwhile
     pub fn new(graph: G, target_heads: Vec<Revision>) -> Self {
+        let mut seed: [u8; 16] = [0; 16];
+        thread_rng().fill_bytes(&mut seed);
+        Self::new_with_seed(graph, target_heads, seed)
+    }
+
+    pub fn new_with_seed(
+        graph: G,
+        target_heads: Vec<Revision>,
+        seed: [u8; 16],
+    ) -> Self {
         PartialDiscovery {
             undecided: None,
             target_heads: Some(target_heads),
             graph: graph.clone(),
             common: MissingAncestors::new(graph, vec![]),
             missing: HashSet::new(),
+            rng: Rng::from_seed(seed),
         }
     }
 
+    /// Extract at most `size` random elements from sample and return them
+    /// as a vector
+    fn limit_sample(
+        &mut self,
+        mut sample: Vec<Revision>,
+        size: usize,
+    ) -> Vec<Revision> {
+        let sample_len = sample.len();
+        if sample_len < size {
+            return sample;
+        }
+        let rng = &mut self.rng;
+        let dropped_size = sample_len - size;
+        let limited_slice = if size < dropped_size {
+            sample.partial_shuffle(rng, size).0
+        } else {
+            sample.partial_shuffle(rng, dropped_size).1
+        };
+        limited_slice.to_owned()
+    }
+
     /// Register revisions known as being common
     pub fn add_common_revisions(
         &mut self,
@@ -123,16 +228,55 @@ 
             undecided: self.undecided.as_ref().map(|s| s.len()),
         }
     }
+
+    pub fn take_quick_sample(
+        &mut self,
+        headrevs: impl IntoIterator<Item = Revision>,
+        size: usize,
+    ) -> Result<Vec<Revision>, GraphError> {
+        self.ensure_undecided()?;
+        let mut sample = {
+            let undecided = self.undecided.as_ref().unwrap();
+            if undecided.len() <= size {
+                return Ok(undecided.iter().cloned().collect());
+            }
+            dagops::heads(&self.graph, undecided.iter())?
+        };
+        if sample.len() >= size {
+            return Ok(self.limit_sample(sample.into_iter().collect(), size));
+        }
+        update_sample(
+            None,
+            headrevs,
+            &mut sample,
+            |r| self.graph.parents(r),
+            Some(size),
+        )?;
+        Ok(sample.into_iter().collect())
+    }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::testing::SampleGraph;
 
     /// A PartialDiscovery as for pushing all the heads of `SampleGraph`
+    ///
+    /// To avoid actual randomness in tests, we give it a fixed random seed.
     fn full_disco() -> PartialDiscovery<SampleGraph> {
-        PartialDiscovery::new(SampleGraph, vec![10, 11, 12, 13])
+        PartialDiscovery::new_with_seed(
+            SampleGraph,
+            vec![10, 11, 12, 13],
+            [0; 16],
+        )
+    }
+
+    /// A PartialDiscovery as for pushing the 12 head of `SampleGraph`
+    ///
+    /// To avoid actual randomness in tests, we give it a fixed random seed.
+    fn disco12() -> PartialDiscovery<SampleGraph> {
+        PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16])
     }
 
     fn sorted_undecided(
@@ -199,4 +343,45 @@ 
         assert_eq!(sorted_common_heads(&disco)?, vec![5, 11, 12]);
         Ok(())
     }
+
+    #[test]
+    fn test_limit_sample_no_need_to() {
+        let sample = vec![1, 2, 3, 4];
+        assert_eq!(full_disco().limit_sample(sample, 10), vec![1, 2, 3, 4]);
+    }
+
+    #[test]
+    fn test_limit_sample_less_than_half() {
+        assert_eq!(full_disco().limit_sample((1..6).collect(), 2), vec![4, 2]);
+    }
+
+    #[test]
+    fn test_limit_sample_more_than_half() {
+        assert_eq!(full_disco().limit_sample((1..4).collect(), 2), vec![3, 2]);
+    }
+
+    #[test]
+    fn test_quick_sample_enough_undecided_heads() -> Result<(), GraphError> {
+        let mut disco = full_disco();
+        disco.undecided = Some((1..=13).collect());
+
+        let mut sample_vec = disco.take_quick_sample(vec![], 4)?;
+        sample_vec.sort();
+        assert_eq!(sample_vec, vec![10, 11, 12, 13]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_quick_sample_climbing_from_12() -> Result<(), GraphError> {
+        let mut disco = disco12();
+        disco.ensure_undecided()?;
+
+        let mut sample_vec = disco.take_quick_sample(vec![12], 4)?;
+        sample_vec.sort();
+        // r12's only parent is r9, whose unique grand-parent through the
+        // diamond shape is r4. This ends there because the distance from r4
+        // to the root is only 3.
+        assert_eq!(sample_vec, vec![4, 9, 12]);
+        Ok(())
+    }
 }
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -7,6 +7,6 @@ 
 [lib]
 name = "hg"
 
-[dev-dependencies]
-rand = "*"
-rand_pcg = "*"
+[dependencies]
+rand = "> 0.6.4"
+rand_pcg = "> 0.1.0"