Basic usage of the package.

Basic usage

First, let’s create 5 clusters normally distributed around 1 to 5, with sd of 0.3:

data <- simulate_data(n=100, sd=0.3, nclust=5, dims=2)
data
## # A tibble: 500 × 4
##       id    V1    V2 true_clust
##    <int> <dbl> <dbl>      <int>
##  1     1 1.31  0.806          1
##  2     2 1.47  1.33           1
##  3     3 1.37  0.762          1
##  4     4 1.12  1.23           1
##  5     5 0.827 1.26           1
##  6     6 0.984 1.26           1
##  7     7 0.565 1.27           1
##  8     8 1.15  1.11           1
##  9     9 1.64  0.716          1
## 10    10 1.05  1.57           1
## # … with 490 more rows

This is what our data looks like:

data %>% ggplot(aes(x=V1, y=V2, color=factor(true_clust))) + 
    geom_point() + 
    scale_color_discrete(name='true cluster')

Now we can cluster it using kmeans++:

data_for_clust <- data %>% select(id, starts_with('V'))
km <- TGL_kmeans_tidy(data_for_clust,
              k=5, 
              metric='euclid', 
              verbose=TRUE)
## id column: id
## KMEans: will generate seeds
## KMeans into generate seeds
## at seed 0
## add new core from 349 to 0
## at seed 1
## done update min distance
## seed range 350 450
## picked up 21 dist was 2.08115
## add new core from 21 to 1
## at seed 2
## done update min distance
## seed range 300 400
## picked up 438 dist was 0.824157
## add new core from 438 to 2
## at seed 3
## done update min distance
## seed range 250 350
## picked up 248 dist was 0.717481
## add new core from 248 to 3
## at seed 4
## done update min distance
## seed range 200 300
## picked up 108 dist was 0.546334
## add new core from 108 to 4
## KMEans: reassign after init
## KMEans: iter 0
## KMEans: iter 1 changed 0

The returned list contains 3 fields:

names(km)
## [1] "centers" "cluster" "size"

km$centers contains a tibble with clust column and the cluster centers:

km$centers
## # A tibble: 5 × 3
##   clust    V1    V2
##   <int> <dbl> <dbl>
## 1     1  1.01  1.12
## 2     2  2.08  2.00
## 3     3  3.03  2.99
## 4     4  4.08  3.95
## 5     5  5.08  5.02

clusters are numbered according to reorder_func (see ‘Custom cluster ordering’ section).

km$cluster contains a tibble with an id column holding the observation id (1:n if no id column was supplied), and a clust column with the cluster assigned to each observation:

km$cluster
## # A tibble: 500 × 2
##    id    clust
##    <chr> <int>
##  1 1         1
##  2 2         1
##  3 3         1
##  4 4         1
##  5 5         1
##  6 6         1
##  7 7         1
##  8 8         1
##  9 9         1
## 10 10        1
## # … with 490 more rows

km$size contains a tibble with a clust column and an n column with the number of points in each cluster:

km$size
## # A tibble: 5 × 2
##   clust     n
##   <int> <int>
## 1     1   102
## 2     2   100
## 3     3    98
## 4     4   100
## 5     5   100

We can now check our clustering performance - fraction of observations that were classified correctly (Note that match_clusters function is internal to the package and is used only in this vignette):

d <- tglkmeans:::match_clusters(data, km, 5)
sum(d$true_clust == d$new_clust, na.rm=TRUE) / sum(!is.na(d$new_clust))
## [1] 0.988

And plot the results:

d %>% ggplot(aes(x=V1, y=V2, color=factor(new_clust), shape=factor(true_clust))) + 
    geom_point() + 
    scale_color_discrete(name='cluster') + 
    scale_shape_discrete(name='true cluster') + 
    geom_point(data=km$centers, size=7, color='black', shape='X')

Custom cluster ordering

By default, the clusters are ordered using the following function: hclust(dist(cor(t(centers)))) - hclust of the Euclidean distance of the correlation matrix of the centers.

We can supply our own function to order the clusters using the reorder_func argument. The function will be applied to each center, and the clusters will be ordered by the result.

km <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
              k=5, 
              metric='euclid', 
              verbose=FALSE, 
              reorder_func=median)
km$centers
## # A tibble: 5 × 3
##   clust    V1    V2
##   <int> <dbl> <dbl>
## 1     1  1.03  1.06
## 2     2  2.00  1.98
## 3     3  2.97  3.03
## 4     4  4.02  3.99
## 5     5  5.02  5.00

Missing data

tglkmeans can deal with missing data, as long as at least one dimension is not missing. For example:

data$V1[sample(1:nrow(data), round(nrow(data)*0.2))] <- NA
data
## # A tibble: 500 × 4
##       id     V1    V2 true_clust
##    <int>  <dbl> <dbl>      <int>
##  1     1  1.31  0.806          1
##  2     2  1.47  1.33           1
##  3     3  1.37  0.762          1
##  4     4  1.12  1.23           1
##  5     5 NA     1.26           1
##  6     6  0.984 1.26           1
##  7     7  0.565 1.27           1
##  8     8  1.15  1.11           1
##  9     9  1.64  0.716          1
## 10    10  1.05  1.57           1
## # … with 490 more rows
km <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
              k=5, 
              metric='euclid', 
              verbose=FALSE)
d <- tglkmeans:::match_clusters(data, km, 5)
sum(d$true_clust == d$new_clust, na.rm=TRUE) / sum(!is.na(d$new_clust))
## [1] 0.964

and plotting the results (without the NA’s) we get:

d %>% ggplot(aes(x=V1, y=V2, color=factor(new_clust), shape=factor(true_clust))) + 
    geom_point() + 
    scale_color_discrete(name='cluster') + 
    scale_shape_discrete(name='true cluster') + 
    geom_point(data=km$centers, size=7, color='black', shape='X')
## Warning: Removed 100 rows containing missing values (geom_point).

High dimensions

Let’s move to higher dimensions (and higher noise):

data <- simulate_data(n=100, sd=0.3, nclust=30, dims=300)
km <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
    k=30, 
    metric='euclid', 
    verbose=FALSE)
d <- tglkmeans:::match_clusters(data, km, 30)
sum(d$true_clust == d$new_clust, na.rm=TRUE) / sum(!is.na(d$new_clust))
## [1] 1

Comparison with R vanilla kmeans

Let’s compare it to R vanilla kmeans:

km_standard <- kmeans(data %>% select(starts_with('V')), 30)
km_standard$clust <- tibble(id = 1:nrow(data), clust=km_standard$cluster)

d <- tglkmeans:::match_clusters(data, km_standard, 30)
sum(d$true_clust == d$new_clust, na.rm=TRUE) / sum(!is.na(d$new_clust))
## [1] 0.75

We can see that kmeans++ clusters significantly better than R vanilla kmeans.

Random seed

We can set the seed of the C++ random number generator, for reproducible results:

km1 <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
               k=30, 
               metric='euclid', 
               verbose=FALSE, 
               seed = 60427)
km2 <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
               k=30, 
               metric='euclid', 
               verbose=FALSE, 
               seed = 60427)
all(km1$centers[, -1] == km2$centers[, -1])
## [1] TRUE