Generating Small, Medium, and Large Datasets

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.

Each dataset contains:

  • Variables/columns from common distributions such as Normal, Binomial, Poisson, and others.

  • Adjustable sample sizes to suit your needs.

library(samplezoo)

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm    norm2    norm3 binom neg pois       exp      unif       beta
#> 1 24.01904 57.10210 33.58085     0   5    5  8.109759 0.2746459 0.06072892
#> 2 52.61473 77.85494 52.61809     1   1    2  5.791296 0.2964852 0.58915675
#> 3 22.85430 50.32991 36.39559     0   3    5 21.151451 0.5230089 0.23070819
#> 4 37.74089 40.54657 50.42342     1   2    5  5.652803 0.1250447 0.34704880
#> 5 57.38885 78.49564 19.06029     0   0    5 10.482339 0.8279693 0.07115475
#> 6 40.53354 80.83017 65.38507     0   0    1  4.198844 0.4381156 0.15960607
#>      gamma     chisq     t_dist
#> 1 2.543362 0.3497569  0.4721417
#> 2 5.347351 0.8319221  1.6167839
#> 3 4.450899 1.8383299  0.3548561
#> 4 2.412426 2.3599334 -0.1526986
#> 5 1.060736 0.8434991  0.5108567
#> 6 2.244033 2.2491525  0.4892787

Generate a medium-sized dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm    norm2     norm3 binom neg pois        exp       unif       beta
#> 1 24.24724 73.29708 22.214310     0   1    5 11.0050841 0.65784701 0.09821555
#> 2 50.45329 53.61844 47.973893     0   2    3  4.9172695 0.27143713 0.06785489
#> 3 35.32228 59.49156 33.161308     0   1    3  5.2108455 0.00046993 0.64303826
#> 4 75.23637 62.10179 60.370066     0   0    5 15.5838949 0.99051073 0.12231201
#> 5 62.44062 63.27317 41.763082     0   1    8  2.5608562 0.69043010 0.31760048
#> 6 44.17461 46.24991  6.367756     0   0    4  0.1999096 0.38624775 0.44643234
#>      gamma     chisq     t_dist
#> 1 1.987381  3.833893  0.7609412
#> 2 9.848751  6.164145 -0.5770737
#> 3 5.344355  3.140235  1.9390034
#> 4 0.941446  8.962906 -0.5503737
#> 5 1.299360  5.785310  0.3709101
#> 6 1.079053 10.883484 -0.6275835

Generate a large-sized dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2    norm3 binom neg pois       exp       unif      beta
#> 1 50.95933 62.03111 33.57413     0   3    4 17.572730 0.71401790 0.4471074
#> 2 49.08107 67.50409 14.55490     0   0    5  1.043247 0.78324006 0.2566909
#> 3 44.92104 61.85103 46.24320     0   1    4  8.063268 0.03044348 0.2062338
#> 4 53.07700 59.65934 30.80349     1   1    0  9.497468 0.56686292 0.2061777
#> 5 60.49422 50.05604 65.94409     0   0    3 23.765609 0.64508146 0.3071335
#> 6 48.76375 53.25095 28.84037     0   1    4 11.256821 0.98811979 0.1356730
#>      gamma     chisq      t_dist
#> 1 4.754528 13.016135  0.61540893
#> 2 3.528773 12.118134 -0.03291589
#> 3 4.090767 15.699932  0.66201871
#> 4 6.468221  6.432281 -0.51771387
#> 5 1.070185  7.569166  0.68421285
#> 6 2.163831 11.673227 -1.77789271

Adding Variation or Ensuring Reproducibility with set.seed()

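To make a generated dataset reproducible, call set.seed() with a fixed value before generating the data: the same seed always yields the same dataset. To obtain a different, but equally reproducible, dataset, use a different seed value, as the two examples below show.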

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2     norm3 binom neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065     0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540     0   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295     0   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849     0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743     0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576     0   1    4  6.363993 0.1442317 0.35908460
#>       gamma     chisq     t_dist
#> 1 6.9893762 10.286282 -0.3814568
#> 2 5.4087626  6.519658 -2.3409216
#> 3 1.2587867  8.011417 -0.4744159
#> 4 0.9871787 14.780626  0.4292511
#> 5 2.4021943  6.799788 -0.6692669
#> 6 4.2109032 17.858701 -0.3370763
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2      norm3 binom neg pois        exp      unif       beta
#> 1 29.84718 68.13494  7.9885694     0   0    5  3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086     0   3    3  0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563     0   2    6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269     0   1    5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258     1   1    1  0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528     1   1    2  4.5592136 0.7628573 0.25880522
#>       gamma     chisq     t_dist
#> 1 6.7914120  4.464348 -1.0150596
#> 2 3.0132520  8.062120  0.3262369
#> 3 4.7360954 10.969593  1.5141157
#> 4 5.1235878  6.249247  0.6432708
#> 5 6.6851637  4.358815  0.2025742
#> 6 0.3903841 20.019575  1.6257109