Generating Small, Medium, and Large Datasets

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.

Each dataset contains:

  • Variables/columns from common distributions such as Normal, Binomial, Poisson, and others.

  • A choice of sample sizes (small, medium, or large) to suit your needs.

library(samplezoo)

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 63.17078 66.45916 53.516804    1   0    4  6.481463 0.2410419 0.15525688
#> 2 35.45895 34.89860 34.605709    0   4    1 23.908570 0.3030301 0.04259387
#> 3 54.05800 72.73750  9.688985    1   0    4 14.056282 0.8598876 0.13860002
#> 4 53.16666 60.27601 47.394944    1   1    5  6.539186 0.1263046 0.63459452
#> 5 62.78092 59.92270 23.784479    0   0    5 33.971733 0.1820170 0.39191500
#> 6 17.86968 61.41629 27.855150    1   4    3  6.772410 0.5542087 0.19612948
#>      gamma    chi_sq      t_dist    f_dist
#> 1 5.960212 20.282286 -0.45809852 1.2033737
#> 2 1.221299  7.720472  0.67804593 0.4855198
#> 3 1.564677  6.296175 -0.02203252 0.8528776
#> 4 4.246349  8.944249  0.50670311 0.3362610
#> 5 5.091944  7.569064 -0.41179705 1.8880333
#> 6 4.362519 13.150804 -0.91484289 6.6996572

Generate a medium-sized dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm   norm_2   norm_3 bern neg pois          exp      unif       beta
#> 1 53.81610 62.52185 40.88840    1   2    1  1.347299418 0.7717186 0.17020415
#> 2 32.23838 69.00252 42.92283    1   0    3  7.564968284 0.1511976 0.04014913
#> 3 86.29126 75.02940 16.89684    0   1    2  5.029708603 0.2068348 0.26510409
#> 4 69.56563 43.08343 37.64668    1   0    4  0.005269265 0.8637200 0.03531230
#> 5 60.89404 62.91326 72.45100    1   1    2 16.713277119 0.1171863 0.38590862
#> 6 66.71723 47.28574 42.21524    0   1    2  5.709692985 0.7065846 0.27864365
#>        gamma    chi_sq     t_dist   f_dist
#> 1  1.2797782  7.563002  0.7040720 1.951610
#> 2  0.9491639 10.666864 -1.2038265 1.575964
#> 3  6.4444169 10.111492  0.2744420 1.406042
#> 4  0.9429589  3.229373 -1.5050957 1.224418
#> 5  7.4719117 12.628434 -0.2515078 2.970643
#> 6 14.4938132 10.112043  0.8163137 0.646780

Generate a large dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2   norm_3 bern neg pois        exp       unif      beta
#> 1 69.06926 54.78800 42.33331    1   2    4 49.0787612 0.29438380 0.1906598
#> 2 65.79423 60.23637 24.51366    0   0    6 23.1443750 0.90527332 0.1019475
#> 3 30.29612 51.96121 49.13541    0   1    3  0.3826007 0.89257582 0.2344017
#> 4 25.24485 72.24773 38.74936    0   3    1  6.6637183 0.04771752 0.4608193
#> 5 47.84531 58.30289 69.35160    1   0    2  3.6652096 0.80507796 0.3263387
#> 6 69.95993 60.63180 28.14861    0   0    7  0.5670046 0.99324266 0.2790363
#>      gamma    chi_sq     t_dist    f_dist
#> 1 2.087312 20.078999  0.3693474 0.5656647
#> 2 5.972130 11.447609  0.9822478 1.0532589
#> 3 3.619075  8.040936  0.6652285 0.7495735
#> 4 5.122370 14.457524 -0.8462000 1.8472764
#> 5 4.062681 14.823928 -0.7654113 1.8105827
#> 6 1.107407  9.837824 -0.1848859 0.2006835

Adding Variation or Ensuring Reproducibility with set.seed()

Call set.seed() before generating random data to control the output: reusing the same seed reproduces exactly the same dataset, while changing the seed produces a different dataset.

Reproducibility

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853
set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853

Variation

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2     norm_3 bern neg pois        exp      unif       beta
#> 1 29.84718 68.13494  7.9885694    0   0    5  3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086    0   3    3  0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563    0   2    6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269    0   1    5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258    1   1    1  0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528    1   1    2  4.5592136 0.7628573 0.25880522
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.7914120  4.464348 -1.0150596 2.2557295
#> 2 3.0132520  8.062120  0.3262369 1.4955877
#> 3 4.7360954 10.969593  1.5141157 1.0766901
#> 4 5.1235878  6.249247  0.6432708 1.1251542
#> 5 6.6851637  4.358815  0.2025742 0.4754946
#> 6 0.3903841 20.019575  1.6257109 0.6653886