Data Setup

# Load the job postings (semicolon-separated CSV), keep only the rows whose
# high-level hard skills are filled in, and derive text-length columns.
dataset <- read.csv("../datasets/jobs_complete.csv", sep = ";") %>%
  filter(hl_hard_skills != "") %>%
  mutate(
    # Coerce to character first so that nchar() measures the text itself.
    title = as.character(title),
    description = as.character(description),
    nchars_title = nchar(title),
    nchars_desc = nchar(description)
  )

# One row per (job, role): `roles` is split on spaces into a list column,
# which is then unnested. The column is named explicitly because tidyr >= 1.0
# warns when `unnest()` is called without saying which column to expand; the
# positional form also works on older tidyr releases.
roles_dataset <- dataset %>%
  mutate(role = str_split(roles, " ")) %>%
  unnest(role) %>%
  select(id, role, created_at, nchars_title, nchars_desc)

# One row per (job, hard skill, role) combination. Both `hard_skills` and
# `roles` are split on spaces, so a job with several roles repeats each of its
# hard skills once per role.
# Each unnest() names its list column explicitly: tidyr >= 1.0 warns when the
# column is omitted, and the positional form also works on older releases.
hs_dataset <- dataset %>%
  mutate(hard_skill = str_split(hard_skills, " ")) %>%
  unnest(hard_skill) %>%
  mutate(role = str_split(roles, " ")) %>%
  unnest(role) %>%
  select(id, role, hard_skill)

# One row per (job, role, high-level hard skill) combination. `roles` is
# split on spaces, `hl_hard_skills` on newlines.
# Each unnest() names its list column explicitly: tidyr >= 1.0 warns when the
# column is omitted, and the positional form also works on older releases.
hlhs_dataset <- dataset %>%
  mutate(role = str_split(roles, " ")) %>%
  unnest(role) %>%
  mutate(hl_hard_skill = str_split(hl_hard_skills, "\n")) %>%
  unnest(hl_hard_skill) %>%
  select(id, role, hl_hard_skill)

# One row per (job, soft skill), restricted to jobs whose `soft_skills` field
# is non-empty. Soft skills are newline-separated.
# unnest() names its list column explicitly: tidyr >= 1.0 warns when the
# column is omitted, and the positional form also works on older releases.
ss_dataset <- dataset %>%
  filter(soft_skills != "") %>%
  mutate(soft_skill = str_split(soft_skills, "\n")) %>%
  unnest(soft_skill) %>%
  select(id, soft_skill)

Data Collection

Jobs in each role

# Per-role table: number of (job, role) rows, median description length, and
# the role's share `p` of all (job, role) rows, most frequent role first.
roles_dataset %>%
  group_by(role) %>%
  summarise(n = n(), nchars_median = median(nchars_desc)) %>%
  arrange(desc(n)) %>%
  # sum(n) does not depend on row order, so the share can be added after sorting.
  mutate(p = n / sum(n)) %>%
  as.data.frame()
##                     role    n nchars_median           p
## 1     FullStackDeveloper 4187        2121.0 0.186204750
## 2       BackendDeveloper 4020        2197.0 0.178777906
## 3    SystemAdministrator 3659        3236.0 0.162723472
## 4      FrontendDeveloper 2497        2329.0 0.111046874
## 5        MobileDeveloper 1721        2611.0 0.076536512
## 6        QATestDeveloper 1386        2651.5 0.061638353
## 7        DevOpsDeveloper 1355        2958.0 0.060259717
## 8  DatabaseAdministrator 1354        2689.0 0.060215245
## 9       DesktopDeveloper  661        2205.0 0.029396069
## 10         DataScientist  522        2800.5 0.023214445
## 11     EmbeddedDeveloper  474        2566.5 0.021079783
## 12        ProductManager  313        3497.0 0.013919772
## 13         GameDeveloper  244        2780.0 0.010851196
## 14              Designer   93        2764.0 0.004135907
# Lollipop chart of the number of jobs per role, with dashed grey guide lines
# at 3000 and 1000 jobs. Roles are ordered by their count.
roles_dataset %>%
  count(role, sort = TRUE) %>%
  ggplot(aes(x = reorder(role, n), y = n)) +
    geom_hline(
      yintercept = c(3000, 1000),
      linetype = "dashed",
      color = "grey",
      size = 0.5
    ) +
    geom_point(size = 3) +
    # Stem of each lollipop: from zero up to the role's count.
    geom_segment(aes(xend = role, y = 0, yend = n)) +
    labs(x = "Developer Roles", y = "Number of Jobs") +
    theme_classic() +
    coord_flip()

Size of Jobs

  • Average number of characters in title: 64
  • Average number of characters in description: 2530.5
# Distribution of title length per role, with roles ordered by their median
# title length; outliers are drawn faintly.
ggplot(
  roles_dataset,
  aes(x = reorder(role, nchars_title, FUN = median), y = nchars_title)
) +
  geom_boxplot(outlier.alpha = 0.1) +
  labs(x = "Developer Roles", y = "Number of characters in title") +
  theme_classic() +
  coord_flip()

# Distribution of description length per role, with roles ordered by their
# median description length. Descriptions longer than 20000 characters are
# excluded from the plot.
ggplot(
  roles_dataset %>% filter(nchars_desc <= 20000),
  aes(x = reorder(role, nchars_desc, FUN = median), y = nchars_desc)
) +
  geom_boxplot(outlier.alpha = 0.1) +
  labs(x = "Developer Roles", y = "Number of characters in description") +
  theme_classic() +
  # Must come after theme_classic(), which would otherwise reset these elements.
  theme(
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  coord_flip()

Hard Skills

Which hard skills are most required?

# Ten most frequent hard skills. hs_dataset repeats a job's skills once per
# role, so (id, hard_skill) pairs are de-duplicated before counting.
# Note: `p` is the skill's share of all distinct (job, skill) pairs, not the
# share of jobs that mention the skill.
rq1_dataset <- hs_dataset %>%
  distinct(id, hard_skill) %>%
  count(hard_skill) %>%
  arrange(desc(n)) %>%
  mutate(p = n / sum(n)) %>%
  top_n(10, n)
rq1_dataset
## # A tibble: 10 x 3
##    hard_skill              n      p
##    <chr>               <int>  <dbl>
##  1 java                 5458 0.0853
##  2 javascript           3256 0.0509
##  3 sql                  2617 0.0409
##  4 python               2559 0.0400
##  5 reactjs              2312 0.0361
##  6 c#                   1990 0.0311
##  7 amazon-web-services  1633 0.0255
##  8 .net                 1597 0.0250
##  9 cloud                1501 0.0235
## 10 linux                1262 0.0197
# Lollipop chart of the top hard skills, ordered by the number of jobs.
rq1_dataset %>%
  ggplot(aes(x = reorder(hard_skill, n), y = n)) +
    geom_point(size = 3) +
    # Stem of each lollipop: from zero up to the skill's count.
    geom_segment(aes(xend = hard_skill, y = 0, yend = n)) +
    labs(x = "Hard Skills", y = "Number of Jobs") +
    theme_classic() +
    coord_flip()

How important are the high-level hard skills for each role?

# Wide role x high-level-hard-skill table of occurrence counts.
# count() replaces group_by(..., add = TRUE) + summarise(n = n()): the `add`
# argument is deprecated since dplyr 1.0 and had no effect on this ungrouped
# input, and count() returns an ungrouped result.
rq2_dataset <- hlhs_dataset %>%
  count(role, hl_hard_skill) %>%
  spread(hl_hard_skill, n) %>%
  as.data.frame() %>%
  # Role/skill combinations that never occur come out of spread() as NA.
  mutate_if(is.numeric, ~replace(., is.na(.), 0))
# Heatmap of how each role's high-level hard-skill mentions are distributed:
# `p` is the skill's share within the role, rounded to three decimals, and is
# both the tile fill and the printed percentage.
rq2_dataset %>%
  gather(c(-role), key = hard_skill, value = n) %>%
  group_by(role) %>%
  mutate(p = round(n / sum(n), 3)) %>%
  # Grouping is only needed for the per-role share; drop it before plotting.
  ungroup() %>%
  ggplot(aes(hard_skill, role)) +
    geom_tile(aes(fill = p)) +
    geom_text(aes(label = sprintf("%.1f%%", 100 * p)), size = 2.5, alpha = 0.6) +
    scale_fill_distiller(type = "seq", palette = "Spectral", na.value = "grey50") +
    labs(x = "High Level Hard Skills", y = "Developer Roles") +
    # "none" hides the legends; passing FALSE is deprecated in ggplot2 >= 3.3.4.
    guides(fill = "none", alpha = "none") +
    theme_classic() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.line = element_blank()
    )

Soft Skills

# Denominator used for the soft-skill shares below.
n_jobs <- 376 # random sample with 95% confidence, p = 0.05

# Occurrences of each soft skill, most frequent first, with `p` expressing the
# count as a fraction of the n_jobs sample size.
rq3_dataset <- ss_dataset %>%
  count(soft_skill, sort = TRUE) %>%
  mutate(p = n / n_jobs)

Which soft skills are most required?

# Display the soft-skill counts and their share of the n_jobs sample.
rq3_dataset
## # A tibble: 10 x 3
##    soft_skill               n      p
##    <chr>                <int>  <dbl>
##  1 Teamwork               215 0.572 
##  2 Communication          193 0.513 
##  3 Responsibility         191 0.508 
##  4 Flexibility            142 0.378 
##  5 Work Ethic             122 0.324 
##  6 Interpersonal Skills    47 0.125 
##  7 Positive Attitude       29 0.0771
##  8 Integrity               22 0.0585
##  9 Courtesy                14 0.0372
## 10 Professionalism          9 0.0239
# Lollipop chart of the soft skills, ordered by the number of jobs.
rq3_dataset %>%
  ggplot(aes(x = reorder(soft_skill, n), y = n)) +
    geom_point(size = 3) +
    # Stem of each lollipop: from zero up to the skill's count.
    geom_segment(aes(xend = soft_skill, y = 0, yend = n)) +
    labs(x = "Soft Skills", y = "Number of Jobs") +
    theme_classic() +
    coord_flip()