Data Setup
# Load the semicolon-separated job postings, keep only rows whose
# `hl_hard_skills` field is non-empty, and record the character length of
# each title and description.
dataset <- read.csv("../datasets/jobs_complete.csv", sep = ";") %>%
  filter(hl_hard_skills != "") %>%
  mutate(
    # Coerce to character first so nchar() measures the text itself.
    title = as.character(title),
    description = as.character(description),
    nchars_title = nchar(title),
    nchars_desc = nchar(description)
  )
# One row per (job, role): split `roles` on single spaces and expand the
# resulting list column, keeping the posting date and the length columns.
roles_dataset <- dataset %>%
  mutate(role = str_split(roles, " ")) %>%
  # Name the list column explicitly: a bare unnest() falls back on the
  # deprecated "unnest every list column" behaviour (warns in tidyr >= 1.0).
  unnest(role) %>%
  select(id, role, created_at, nchars_title, nchars_desc)
# One row per (job, role, hard skill) combination. `hard_skills` and `roles`
# are both split on single spaces; each list column is expanded right after
# it is created, so only one list column exists at a time.
hs_dataset <- dataset %>%
  mutate(hard_skill = str_split(hard_skills, " ")) %>%
  # Explicit column instead of a bare unnest(), whose implicit
  # "all list columns" selection is deprecated in tidyr >= 1.0.
  unnest(hard_skill) %>%
  mutate(role = str_split(roles, " ")) %>%
  unnest(role) %>%
  select(id, role, hard_skill)
# One row per (job, role, high-level hard skill) combination. `roles` is
# split on single spaces and `hl_hard_skills` on newlines; each list column
# is expanded right after it is created.
hlhs_dataset <- dataset %>%
  mutate(role = str_split(roles, " ")) %>%
  # Explicit column instead of a bare unnest(), whose implicit
  # "all list columns" selection is deprecated in tidyr >= 1.0.
  unnest(role) %>%
  mutate(hl_hard_skill = str_split(hl_hard_skills, "\n")) %>%
  unnest(hl_hard_skill) %>%
  select(id, role, hl_hard_skill)
# One row per (job, soft skill), restricted to jobs whose `soft_skills`
# field is non-empty; the field is split on newlines.
ss_dataset <- dataset %>%
  filter(soft_skills != "") %>%
  mutate(soft_skill = str_split(soft_skills, "\n")) %>%
  # Explicit column instead of a bare unnest(), whose implicit
  # "all list columns" selection is deprecated in tidyr >= 1.0.
  unnest(soft_skill) %>%
  select(id, soft_skill)
Data Collection
- Number of jobs: 20968
- Number of hard skills: 275
- Number of high-level hard skills: 6
- First job post of our analysis was posted on: 2019-03-25
- Last job post of our analysis was posted on: 2019-06-28
- Number of jobs with at least one soft skill: 314 out of 376.
Jobs in each role
# Per-role summary: number of (job, role) rows, median description length,
# and each role's share of all rows, listed from most to least frequent.
roles_dataset %>%
  group_by(role) %>%
  summarise(n = n(), nchars_median = median(nchars_desc)) %>%
  mutate(p = n / sum(n)) %>%
  arrange(-n) %>%
  as.data.frame()
## role n nchars_median p
## 1 FullStackDeveloper 4187 2121.0 0.186204750
## 2 BackendDeveloper 4020 2197.0 0.178777906
## 3 SystemAdministrator 3659 3236.0 0.162723472
## 4 FrontendDeveloper 2497 2329.0 0.111046874
## 5 MobileDeveloper 1721 2611.0 0.076536512
## 6 QATestDeveloper 1386 2651.5 0.061638353
## 7 DevOpsDeveloper 1355 2958.0 0.060259717
## 8 DatabaseAdministrator 1354 2689.0 0.060215245
## 9 DesktopDeveloper 661 2205.0 0.029396069
## 10 DataScientist 522 2800.5 0.023214445
## 11 EmbeddedDeveloper 474 2566.5 0.021079783
## 12 ProductManager 313 3497.0 0.013919772
## 13 GameDeveloper 244 2780.0 0.010851196
## 14 Designer 93 2764.0 0.004135907
# Lollipop chart of the number of job rows per role, ordered by count, with
# dashed grey guide lines at 1000 and 3000 jobs.
roles_dataset %>%
  count(role, sort = TRUE) %>%
  ggplot(aes(x = reorder(role, n), y = n)) +
  geom_hline(
    yintercept = c(3000, 1000),
    linetype = "dashed",
    color = "grey",
    size = 0.5
  ) +
  geom_point(size = 3) +
  # Stem of each lollipop, drawn from zero up to the count.
  geom_segment(aes(xend = role, y = 0, yend = n)) +
  labs(x = "Developer Roles", y = "Number of Jobs") +
  theme_classic() +
  coord_flip()

Size of Jobs
- Average number of characters in title: 64
- Average number of characters in description: 2530.5
# Distribution of title length per role; roles are ordered by their median
# title length and outliers are drawn faintly.
roles_dataset %>%
  ggplot(
    aes(
      x = reorder(role, nchars_title, FUN = median),
      y = nchars_title
    )
  ) +
  geom_boxplot(outlier.alpha = 0.1) +
  labs(x = "Developer Roles", y = "Number of characters in title") +
  theme_classic() +
  coord_flip()

# Distribution of description length per role, ordered by median length.
# Descriptions longer than 20000 characters are left out of the plot.
roles_dataset %>%
  filter(nchars_desc <= 20000) %>%
  ggplot(
    aes(
      x = reorder(role, nchars_desc, FUN = median),
      y = nchars_desc
    )
  ) +
  geom_boxplot(outlier.alpha = 0.1) +
  labs(x = "Developer Roles", y = "Number of characters in description") +
  theme_classic() +
  # Applied after theme_classic() so these overrides are not reset.
  theme(axis.line.y = element_blank(), axis.ticks.y = element_blank()) +
  coord_flip()

Hard Skills
Which hard skills are most required?
# For each hard skill, count the distinct jobs that mention it and keep the
# ten most frequent skills. `p` is the skill's share of all distinct
# (job, skill) pairs and is computed before the top-10 cut.
rq1_dataset <- hs_dataset %>%
  distinct(id, hard_skill) %>%
  count(hard_skill) %>%
  arrange(desc(n)) %>%
  mutate(p = n / sum(n)) %>%
  top_n(n = 10, wt = n)

rq1_dataset
## # A tibble: 10 x 3
## hard_skill n p
## <chr> <int> <dbl>
## 1 java 5458 0.0853
## 2 javascript 3256 0.0509
## 3 sql 2617 0.0409
## 4 python 2559 0.0400
## 5 reactjs 2312 0.0361
## 6 c# 1990 0.0311
## 7 amazon-web-services 1633 0.0255
## 8 .net 1597 0.0250
## 9 cloud 1501 0.0235
## 10 linux 1262 0.0197
# Lollipop chart of the top hard skills, ordered by number of jobs.
rq1_dataset %>%
  ggplot(aes(x = reorder(hard_skill, n), y = n)) +
  geom_point(size = 3) +
  # Stem of each lollipop, drawn from zero up to the count.
  geom_segment(aes(xend = hard_skill, y = 0, yend = n)) +
  labs(x = "Hard Skills", y = "Number of Jobs") +
  theme_classic() +
  coord_flip()

How important are the high-level hard skills for each role?
# Wide table of counts: one row per role, one column per high-level hard
# skill. Role/skill combinations that never occur are set to 0.
rq2_dataset <- hlhs_dataset %>%
  # count() replaces group_by(..., add = TRUE) + summarise(n = n()): the
  # `add` argument is deprecated since dplyr 1.0 and had no effect here
  # because hlhs_dataset carries no prior grouping.
  count(role, hl_hard_skill) %>%
  spread(hl_hard_skill, n) %>%
  as.data.frame() %>%
  # spread() leaves NA for missing combinations; turn those into zeros.
  mutate_if(is.numeric, ~ replace(., is.na(.), 0))
# Heatmap of how each role's high-level hard-skill mentions are distributed:
# every tile shows the skill's share within that role, as a percentage.
rq2_dataset %>%
  # Back to long form: one row per (role, skill) with its count.
  gather(c(-role), key = hard_skill, value = n) %>%
  as.data.frame() %>%
  group_by(role) %>%
  # Share of each skill within its role, rounded to three decimals.
  mutate(p = round(n / sum(n), 3)) %>%
  ungroup() %>%
  ggplot(aes(hard_skill, role)) +
  geom_tile(aes(fill = p)) +
  geom_text(aes(label = sprintf("%.1f%%", 100 * p)), size = 2.5, alpha = 0.6) +
  scale_fill_distiller(type = "seq", palette = "Spectral", na.value = "grey50") +
  labs(x = "High Level Hard Skills", y = "Developer Roles") +
  # "none" is the supported way to hide a legend; passing FALSE is
  # deprecated in recent ggplot2 releases.
  guides(fill = "none", alpha = "none") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.line = element_blank()
  )

Soft Skills
# Denominator used for the soft-skill proportions below.
n_jobs <- 376 # random sample with 95% confidence, p = 0.05

# Number of rows per soft skill, most frequent first; `p` is that count
# divided by `n_jobs`.
rq3_dataset <- ss_dataset %>%
  count(soft_skill, sort = TRUE) %>%
  mutate(p = n / n_jobs)
Which soft skills are most required?
# Show the soft-skill table (columns soft_skill, n, p).
rq3_dataset
## # A tibble: 10 x 3
## soft_skill n p
## <chr> <int> <dbl>
## 1 Teamwork 215 0.572
## 2 Communication 193 0.513
## 3 Responsibility 191 0.508
## 4 Flexibility 142 0.378
## 5 Work Ethic 122 0.324
## 6 Interpersonal Skills 47 0.125
## 7 Positive Attitude 29 0.0771
## 8 Integrity 22 0.0585
## 9 Courtesy 14 0.0372
## 10 Professionalism 9 0.0239
# Lollipop chart of soft skills, ordered by the number of jobs listing them.
rq3_dataset %>%
  ggplot(aes(x = reorder(soft_skill, n), y = n)) +
  geom_point(size = 3) +
  # Stem of each lollipop, drawn from zero up to the count.
  geom_segment(aes(xend = soft_skill, y = 0, yend = n)) +
  labs(x = "Soft Skills", y = "Number of Jobs") +
  theme_classic() +
  coord_flip()
