Loading the packages

library(readxl)
library(dplyr)
library(ggplot2)
library(data.table)
library(gridExtra)
library(rlang)
library(stringr)
library(knitr)
library(kableExtra)

Loading the file

# Checking the sheets
excel_sheets('failures.xlsm')
## [1] "FAILURES"   "ORDERS"     "PRODUCTION"
# Failures
failures <- read_excel('failures.xlsm', sheet = 'FAILURES', skip = 2)
production <- read_excel('failures.xlsm', sheet = 'PRODUCTION', skip = 2)

Preparing the dataset

# Checking the type of variables
str(failures)
## Classes 'tbl_df', 'tbl' and 'data.frame':    676 obs. of  45 variables:
##  $ ORDER    : chr  "095598.AA" "095598.AA" "095598.AA" "095598.AA" ...
##  $ ID       : chr  "10-2092/90D" "10-2084/90C" "10-2084/90B" "10-2081/02C" ...
##  $ LENGTH   : num  222 202 202 202 204 202 202 202 202 102 ...
##  $ MACHINE  : chr  "BA_12" "BA_15" "BA_15" "BA_04" ...
##  $ DATE     : POSIXct, format: "2018-12-04" "2018-12-04" ...
##  $ 1        : logi  NA NA NA NA NA NA ...
##  $ 2        : chr  "22" NA NA NA ...
##  $ 3        : chr  NA NA NA NA ...
##  $ 4        : chr  "95, 120, 124" "152" NA NA ...
##  $ 5        : chr  NA NA "135" NA ...
##  $ 6        : chr  NA NA NA NA ...
##  $ 7        : chr  NA NA NA NA ...
##  $ 8        : chr  NA NA NA NA ...
##  $ 9        : chr  NA NA NA NA ...
##  $ 10       : chr  NA NA NA "130" ...
##  $ 11       : chr  NA NA NA NA ...
##  $ 12       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 13       : chr  NA NA NA NA ...
##  $ 14       : logi  NA NA NA NA NA NA ...
##  $ 15       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 16       : chr  "23" NA NA NA ...
##  $ 17       : chr  "147" NA NA NA ...
##  $ 18       : chr  NA NA NA NA ...
##  $ 19_NOTES : chr  NA NA NA NA ...
##  $ 19_LENGTH: chr  NA NA NA NA ...
##  $ 1_       : logi  NA NA NA NA NA NA ...
##  $ 2_       : num  1 NA NA NA NA 1 NA 1 NA NA ...
##  $ 3_       : num  NA NA NA NA 1 NA NA NA NA NA ...
##  $ 4_       : num  3 1 NA NA 1 NA 1 NA NA 1 ...
##  $ 5_       : num  NA NA 1 NA NA NA NA NA 1 1 ...
##  $ 6_       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 7_       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 8_       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 9_       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 10_      : num  NA NA NA 1 NA NA NA NA NA NA ...
##  $ 11_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 12_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 13_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 14_      : logi  NA NA NA NA NA NA ...
##  $ 15_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 16_      : num  1 NA NA NA NA NA NA NA NA NA ...
##  $ 17_      : num  1 NA NA NA NA NA NA NA NA 1 ...
##  $ 18_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 19_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TOTAL    : num  6 1 1 1 2 1 1 1 1 3 ...
# Setting failures$MACHINE as categorical
failures$MACHINE <- as.factor(failures$MACHINE)

# Setting production$MACHINE and production$VERSION as categorical
production$MACHINE <- as.factor(production$MACHINE)
# Replacing VERSION NAs with 0
production$VERSION[is.na(production$VERSION)] <- 0
production$VERSION <- as.factor(production$VERSION)
# Setting 0 as BEFORE and 1 as AFTER
levels(production$VERSION) <- c('BEFORE', 'AFTER')

# Standardizing IDs of the failures set
head(production$ID)
## [1] "10-2754/91" "10-2737/91" "10-2743/90" "10-2759/90" "10-2736/91"
## [6] "10-2758/90"
head(failures$ID)
## [1] "10-2092/90D" "10-2084/90C" "10-2084/90B" "10-2081/02C" "10-2066/91D"
## [6] "10-2066/91C"
# To standardize the ID of failures set, it's only needed to remove the letter from the end of the ID
failures_original <- failures
failures$ID <- gsub('\\D$', '', failures$ID)

PERFOMANCE ANALYSES

Considering that the current situation regards to the failures of a production, it’s important to consider relational metrics intead of absolute: Perfomance instead of Production or Failures.
The performance metric that will be chosen is an adaptation of MTBF (Mean Tima Between Failures) concept.
For this project the performance will be measure by: Produced Length Between Failures.
To be able to calculate this performance, it’s important to discard failures registers that don’t have production registers.

Removing registers from the failures set which don’t have production registers.

registers <- as.vector(unlist(production$ID))
failures <- failures %>% filter(ID %in% registers)

Merging productions and failures data sets by ID

colnames(failures)
##  [1] "ORDER"     "ID"        "LENGTH"    "MACHINE"   "DATE"     
##  [6] "1"         "2"         "3"         "4"         "5"        
## [11] "6"         "7"         "8"         "9"         "10"       
## [16] "11"        "12"        "13"        "14"        "15"       
## [21] "16"        "17"        "18"        "19_NOTES"  "19_LENGTH"
## [26] "1_"        "2_"        "3_"        "4_"        "5_"       
## [31] "6_"        "7_"        "8_"        "9_"        "10_"      
## [36] "11_"       "12_"       "13_"       "14_"       "15_"      
## [41] "16_"       "17_"       "18_"       "19_"       "TOTAL"
production_failures <- merge(production, failures[, c(2, 6:45)], by = 'ID', all = T)
kable(head(production_failures, 10)) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed'),
                font_size = 9)
ID ORDER PRODUCT MACHINE DATE LENGTH VERSION 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19_NOTES 19_LENGTH 1_ 2_ 3_ 4_ 5_ 6_ 7_ 8_ 9_ 10_ 11_ 12_ 13_ 14_ 15_ 16_ 17_ 18_ 19_ TOTAL
10-0650/90 091284.AA TYPE A BA_12 2018-04-27 808 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA 93 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA 24 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 22, 75 NA NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 2 NA NA 3
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA 88 62 NA NA NA NA NA NA NA NA NA NA NA 8 NA NA NA NA NA NA 1 1 NA NA NA NA NA NA NA NA NA NA NA 1 NA NA 3
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 66 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 NA NA 1
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA NA 94 NA NA NA NA NA NA NA NA NA NA NA 13 NA NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA 1 NA NA 2
10-2090/91 095598.AA TYPE A BA_10 2018-12-04 797 BEFORE NA NA NA 48, 64 A 78, 83, 85, 144 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 5
10-2090/91 095598.AA TYPE A BA_10 2018-12-04 797 BEFORE NA NA NA 159 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1
10-2092/91 095598.AA TYPE A BA_12 2018-12-05 784 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 43 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 NA NA NA 1
10-2092/91 095598.AA TYPE A BA_12 2018-12-05 784 BEFORE NA NA NA 20, 26, 34 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 3
colnames(production_failures)
##  [1] "ID"        "ORDER"     "PRODUCT"   "MACHINE"   "DATE"     
##  [6] "LENGTH"    "VERSION"   "1"         "2"         "3"        
## [11] "4"         "5"         "6"         "7"         "8"        
## [16] "9"         "10"        "11"        "12"        "13"       
## [21] "14"        "15"        "16"        "17"        "18"       
## [26] "19_NOTES"  "19_LENGTH" "1_"        "2_"        "3_"       
## [31] "4_"        "5_"        "6_"        "7_"        "8_"       
## [36] "9_"        "10_"       "11_"       "12_"       "13_"      
## [41] "14_"       "15_"       "16_"       "17_"       "18_"      
## [46] "19_"       "TOTAL"
production_failures[, c(29:47)][is.na(production_failures[, c(29:47)])] <- 0
kable(head(production_failures, 10)) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed'),
                font_size = 9)
ID ORDER PRODUCT MACHINE DATE LENGTH VERSION 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19_NOTES 19_LENGTH 1_ 2_ 3_ 4_ 5_ 6_ 7_ 8_ 9_ 10_ 11_ 12_ 13_ 14_ 15_ 16_ 17_ 18_ 19_ TOTAL
10-0650/90 091284.AA TYPE A BA_12 2018-04-27 808 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA 93 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA 24 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 22, 75 NA NA NA NA 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 3
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA 88 62 NA NA NA NA NA NA NA NA NA NA NA 8 NA NA NA NA 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 3
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 66 NA NA NA NA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA NA 94 NA NA NA NA NA NA NA NA NA NA NA 13 NA NA NA NA 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2
10-2090/91 095598.AA TYPE A BA_10 2018-12-04 797 BEFORE NA NA NA 48, 64 A 78, 83, 85, 144 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5
10-2090/91 095598.AA TYPE A BA_10 2018-12-04 797 BEFORE NA NA NA 159 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
10-2092/91 095598.AA TYPE A BA_12 2018-12-05 784 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 43 NA NA NA NA NA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
10-2092/91 095598.AA TYPE A BA_12 2018-12-05 784 BEFORE NA NA NA 20, 26, 34 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3

Calculating the performance

production_failures <- production_failures %>%
  mutate(PERFORMANCE = if_else(TOTAL == 0, LENGTH, LENGTH / TOTAL))
kable(head(production_failures, 10)) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed'),
                font_size = 9)
ID ORDER PRODUCT MACHINE DATE LENGTH VERSION 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19_NOTES 19_LENGTH 1_ 2_ 3_ 4_ 5_ 6_ 7_ 8_ 9_ 10_ 11_ 12_ 13_ 14_ 15_ 16_ 17_ 18_ 19_ TOTAL PERFORMANCE
10-0650/90 091284.AA TYPE A BA_12 2018-04-27 808 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 808.0000
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA 93 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1075.0000
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA 24 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 22, 75 NA NA NA NA 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 3 358.3333
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA 88 62 NA NA NA NA NA NA NA NA NA NA NA 8 NA NA NA NA 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 3 358.3333
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 66 NA NA NA NA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1075.0000
10-2019/91 094011.AA TYPE B CA_13 2018-12-03 1075 BEFORE NA NA NA NA 94 NA NA NA NA NA NA NA NA NA NA NA 13 NA NA NA NA 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 537.5000
10-2090/91 095598.AA TYPE A BA_10 2018-12-04 797 BEFORE NA NA NA 48, 64 A 78, 83, 85, 144 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 159.4000
10-2090/91 095598.AA TYPE A BA_10 2018-12-04 797 BEFORE NA NA NA 159 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 797.0000
10-2092/91 095598.AA TYPE A BA_12 2018-12-05 784 BEFORE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 43 NA NA NA NA NA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 784.0000
10-2092/91 095598.AA TYPE A BA_12 2018-12-05 784 BEFORE NA NA NA 20, 26, 34 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 261.3333

Checking the occurrences of each failure before the changes

num_failures_before = production_failures[, c(7, 28:46)] %>% filter(VERSION == 'BEFORE')
num_failures_before[is.na(num_failures_before)] <- 0
num_failures_before <- num_failures_before[, -1]
num_failures_before <- data.frame(cbind(
  FAILURES = as.factor(gsub('_', '', colnames(production_failures[, c(28:46)]))),
  FREQUENCY = apply(num_failures_before, 2, sum)))
num_failures_before <- num_failures_before[order(-num_failures_before$FREQUENCY),]
num_failures_before <- num_failures_before %>% mutate(
  CUM_FAILURES = cumsum(FREQUENCY),
  CUM_PERC = round((CUM_FAILURES / sum(FREQUENCY) * 100), 2)
)

# Plotting the occurrences
ggplot(num_failures_before, aes(x = reorder(FAILURES, CUM_FAILURES))) +
  geom_col(aes(y = FREQUENCY), fill = 'lightblue', color = 'darkblue', alpha = 0.7, group = 1) +
  ggtitle('FREQUENCY OF FAILURES BEFORE THE CHANGES') +
  geom_line(aes(y = CUM_PERC*264/100), group = 2, color = 'darkred') +
  geom_point(aes(y = CUM_PERC*264/100), group = 2, shape = 21, color = 'darkred', fill = 'darkred', size = 2) +
  geom_text(aes(y = CUM_PERC*264/100, label = paste(CUM_PERC, '%', sep = '')), vjust = -1, size = 3) +
  scale_y_continuous(sec.axis = sec_axis(~.*100/264, name = 'Cumulative percentage'), limits = c(0, 264)) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'))

As shown in the plot, 6 failures represent about 80% of the total of failures BEFORE the changes: 4, 3, 16, 2, 5 and 17.

Checking the occurrences of each failure after the changes

num_failures_after = production_failures[, c(7, 28:46)] %>% filter(VERSION == 'AFTER')
num_failures_after[is.na(num_failures_after)] <- 0
num_failures_after <- num_failures_after[, -1]
num_failures_after <- data.frame(cbind(
  FAILURES = as.factor(gsub('_', '', colnames(production_failures[, c(28:46)]))),
  FREQUENCY = apply(num_failures_after, 2, sum)))
num_failures_after <- num_failures_after[order(-num_failures_after$FREQUENCY),]
num_failures_after <- num_failures_after %>% mutate(
  CUM_FAILURES = cumsum(FREQUENCY),
  CUM_PERC = round((CUM_FAILURES / sum(FREQUENCY) * 100), 2)
)

# Plotting the occurrences
ggplot(num_failures_after, aes(x = reorder(FAILURES, -FREQUENCY))) +
  geom_col(aes(y = FREQUENCY), fill = 'lightblue', color = 'darkblue', alpha = 0.7, group = 1) +
  ggtitle('FREQUENCY OF FAILURES AFTER THE CHANGES') +
  geom_line(aes(y = CUM_PERC*61/100), group = 2, color = 'darkred') +
  geom_point(aes(y = CUM_PERC*61/100), group = 2, shape = 21, color = 'darkred', fill = 'darkred', size = 2) +
  geom_text(aes(y = CUM_PERC*61/100, label = paste(CUM_PERC, '%', sep = '')), vjust = -1, size = 3) +
  scale_y_continuous(sec.axis = sec_axis(~.*100/61, name = 'Cumulative percentage'), limits = c(0, 61)) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'))

After the changes there are again 6 failures which represent about 80% of the total.
But only the failure 16 from the list of before changes is repeated.
It shows that probably the changes have contributed to the improvements of that failures.

Even though only the mean of performance can’t be enough to make decisions, it’s a simple way to check the evolution between before and after modifications.

Mean of Performance before and after changes

ggplot(distinct(production_failures %>%
           group_by(VERSION) %>%
           mutate(PERFORMANCE = mean(PERFORMANCE)) %>%
           select(VERSION, PERFORMANCE),
         VERSION, .keep_all = T)) +
  geom_col(aes(x = VERSION, y = PERFORMANCE), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
  ggtitle('MEAN OF PERFORMANCE BEFORE AND AFTER MODIFICATIONS') +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'))

As expected, the mean of performance after modifications is higher than before
Function to plot the comparisons of performance between before and after

compare_performances <- function(df, variable){
  target <- parse_quosure(variable) #quo
  # Listing the occurrences of variable to be compared
  list_to_compare <- as.vector(df[, variable])
  
  lapply(list_to_compare, function(x){
    before = ggplot(df %>% filter(!!target == x), aes(x = !!target)) +
      geom_col(aes(y = BEFORE), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
      ggtitle('PERFORMANCE BEFORE CHANGES') +
      geom_text(aes(y = BEFORE, label = round(BEFORE)), vjust = -0.5, size = 3) +
      theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold')) +
      ylim(0, if_else(max(df$BEFORE) > max(df$AFTER), max(df$BEFORE), max(df$AFTER)))
    
    after = ggplot(df %>% filter(!!target == x), aes(x = !!target)) +
      geom_col(aes(y = AFTER), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
      ggtitle('PERFORMANCE AFTER CHANGES') +
      geom_text(aes(y = AFTER, label = round(AFTER)), vjust = -0.5, size = 3) +
      theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold')) +
      ylim(0, if_else(max(df$BEFORE) > max(df$AFTER), max(df$BEFORE), max(df$AFTER)))
    
    grid.arrange(before, after, ncol = 2)
  })
}

Mean of Performance before and after changes by product

product_performance <- dcast(distinct(production_failures %>%
                                        group_by(PRODUCT, VERSION) %>%
                                        mutate(PERFORMANCE = mean(PERFORMANCE)) %>%
                                        select(PRODUCT, VERSION, PERFORMANCE),
                                      PRODUCT, VERSION, PERFORMANCE, .keep_all = T),
                             PRODUCT ~ VERSION)
## Using 'PERFORMANCE' as value column. Use 'value.var' to override
# Replacing NA values with 0
product_performance[is.na(product_performance)] <- 0
# Calculating the evolution of performance for products that was produced both before and after changes
product_performance <- product_performance %>%
  mutate(DIF_PERFORMANCE = if_else(BEFORE == 0 | AFTER == 0, 0, AFTER - BEFORE),
         STATUS_EVOLUTION = if_else(DIF_PERFORMANCE > 0, 1, -1))
# Plotting the performances
compare_performances(product_performance, 'PRODUCT')

product_performance

Having 0 performance, it means that the production don’t repeat for both versions.
Either it stoped the production after changes or it started the production.
But keeping these products in the dataset is quite important to check the variation of performances.

Checking the evolution between products

ggplot(product_performance %>% filter(DIF_PERFORMANCE != 0)) +
  geom_col(aes(x = PRODUCT,
               y = DIF_PERFORMANCE,
               fill = factor(STATUS_EVOLUTION, labels = c('Worse', 'Better'))), alpha = 0.5) +
  ggtitle('EVOLUTION OF PERFORMANCE BY PRODUCT') +
  labs(fill = 'Situation') +
  geom_text(aes(x = PRODUCT, y = DIF_PERFORMANCE, label = round(DIF_PERFORMANCE, 2)), vjust = -0.5, size = 3) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'),
        legend.position = 'bottom',
        legend.title = element_text(face = 'bold'))

Between the products with production before and after changes, only TYPE A got worse.
List of products which has gotten worse

worse_products <- as.vector(unlist(product_performance %>% filter(DIF_PERFORMANCE < 0) %>% select(PRODUCT)))

List of products which has improved

best_products <- as.vector(unlist(product_performance %>% filter(DIF_PERFORMANCE > 0) %>% select(PRODUCT)))

MEAN OF PERFORMANCE BEFORE AND AFTER CHANGES BY MACHINE

machine_performance <- dcast(distinct(production_failures %>%
                                        group_by(MACHINE, VERSION) %>%
                                        mutate(PERFORMANCE = mean(PERFORMANCE)) %>%
                                        select(MACHINE, VERSION, PERFORMANCE),
                                      MACHINE, VERSION, PERFORMANCE, .keep_all = T),
                             MACHINE ~ VERSION)
## Using 'PERFORMANCE' as value column. Use 'value.var' to override
# Replacing NA values with 0
machine_performance[is.na(machine_performance)] <- 0
# Calculating the evolution of performance for machines that was produced both before and after changes
machine_performance <- machine_performance %>%
  mutate(DIF_PERFORMANCE = if_else(BEFORE == 0 | AFTER == 0, 0, AFTER - BEFORE),
         STATUS_EVOLUTION = if_else(DIF_PERFORMANCE > 0, 1, -1))
# Plotting the performances
compare_performances(machine_performance, 'MACHINE')

Having 0 of performance, it means that the production wont repeat for both versions.
Either it stoped the production after changes or it started the production.
But keeping these machines in the dataset is quite important to check the variation of performances.

Checking the evolution between machines

ggplot(machine_performance %>% filter(DIF_PERFORMANCE != 0)) +
  geom_col(aes(x = MACHINE,
               y = DIF_PERFORMANCE,
               fill = factor(STATUS_EVOLUTION, labels = c('Worse', 'Better'))), alpha = 0.5) +
  ggtitle('EVOLUTION OF PERFORMANCE BY MACHINE') +
  labs(fill = 'Situation') +
  geom_text(aes(x = MACHINE, y = DIF_PERFORMANCE, label = round(DIF_PERFORMANCE, 2)), vjust = -0.5, size = 3) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'),
        legend.position = 'bottom',
        legend.title = element_text(face = 'bold'))

Between the machines with production before and after changes, three of them got worse: BA_13, BA_14, CA_06 and DT_12.
But for the whole department, the most machines improved.

List of machines which has gotten worse

worse_machines <- as.vector(unlist(machine_performance %>% filter(DIF_PERFORMANCE < 0) %>% select(MACHINE)))

List of machines which has improved

best_machines <- as.vector(unlist(machine_performance %>% filter(DIF_PERFORMANCE > 0) %>% select(MACHINE)))

Checking what machines have produced the worse products

demand_by_machine_worse_products <- dcast(distinct(production_failures %>%
           filter(PRODUCT %in% worse_products) %>%
           group_by(PRODUCT, MACHINE, VERSION) %>%
           mutate(PRODUCTION = sum(LENGTH)) %>%
           select(PRODUCT, MACHINE, VERSION, PRODUCTION),
         MACHINE, VERSION, .keep_all = T),
      MACHINE + PRODUCT ~ VERSION)
## Using 'PRODUCTION' as value column. Use 'value.var' to override
demand_by_machine_worse_products[is.na(demand_by_machine_worse_products)] <- 0
demand_by_machine_worse_products %>% mutate(DIF_DEMAND = if_else(BEFORE > AFTER, 'REDUCED', 'INCREASED'))
##    MACHINE PRODUCT BEFORE AFTER DIF_DEMAND
## 1    BA_01  TYPE A  20290     0    REDUCED
## 2    BA_04  TYPE A   9360     0    REDUCED
## 3    BA_05  TYPE A   6835     0    REDUCED
## 4    BA_06  TYPE A  24036     0    REDUCED
## 5    BA_07  TYPE A  40180  8073    REDUCED
## 6    BA_08  TYPE A  40334 11267    REDUCED
## 7    BA_09  TYPE A  16851     0    REDUCED
## 8    BA_10  TYPE A  25855     0    REDUCED
## 9    BA_12  TYPE A  15908  4800    REDUCED
## 10   BA_15  TYPE A  11300 10462    REDUCED
## 11   DT_11  TYPE A  36457     0    REDUCED
## 12   DT_12  TYPE A  83804 29557    REDUCED

As checked above, all the machines that produced the worse products have reduced the demand.
There are at least two basic conclusions:
1st: The machines have not to do with the descreasement of the mean of performance
2nd: As shorter the products are, as more it’s fails

To test this second one, it’s important to check if the demand by machine of the best products have increased.

demand_by_machine_best_products <- dcast(distinct(production_failures %>%
                                                    filter(PRODUCT %in% best_products) %>%
                                                    group_by(PRODUCT, MACHINE, VERSION) %>%
                                                    mutate(PRODUCTION = sum(LENGTH)) %>%
                                                    select(PRODUCT, MACHINE, VERSION, PRODUCTION),
                                                  MACHINE, VERSION, .keep_all = T),
                                         MACHINE + PRODUCT ~ VERSION)
## Using 'PRODUCTION' as value column. Use 'value.var' to override
demand_by_machine_best_products[is.na(demand_by_machine_best_products)] <- 0
demand_by_machine_best_products <- demand_by_machine_best_products %>%
  mutate(DIF_DEMAND = if_else(BEFORE > AFTER, 'REDUCED', 'INCREASED'))
kable(table(demand_by_machine_best_products$DIF_DEMAND))
Var1 Freq
INCREASED 9
REDUCED 12


In general, the most cases show that the demand has decreased.

But to assume it’s true, it’s important to measure the difference of length mean.

demand_by_machine_best_products <- dcast(distinct(production_failures %>%
                                                    filter(PRODUCT %in% best_products) %>%
                                                    group_by(PRODUCT, MACHINE, ID, VERSION) %>%
                                                    mutate(PRODUCTION = mean(LENGTH)) %>%
                                                    select(PRODUCT, MACHINE, ID, VERSION, PRODUCTION),
                                                  MACHINE, VERSION, .keep_all = T),
                                         MACHINE + ID + PRODUCT ~ VERSION)
## Using 'PRODUCTION' as value column. Use 'value.var' to override
kable(demand_by_machine_best_products) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed'),
                font_size = 9)
MACHINE ID PRODUCT BEFORE AFTER
BA_02 11-1078/90 TYPE B NA 1020
BA_02 11-1078/91 TYPE B NA 1014
BA_08 11-0966/93 TYPE B NA 513
BA_09 10-2828/90 TYPE B 286 NA
BA_09 11-0966/90 TYPE B NA 1020
BA_09 11-0966/91 TYPE B NA 1020
BA_09 11-0966/92 TYPE B NA 1020
BA_09 11-1087/90 TYPE B NA 1122
BA_09 11-1087/91 TYPE B NA 1032
BA_10 11-0975/90 TYPE B 1020 NA
BA_10 11-0975/91 TYPE B NA 1020
BA_10 11-0975/92 TYPE B 1020 NA
BA_10 11-0975/93 TYPE B 547 NA
BA_13 11-1018/90 TYPE B 1020 NA
BA_13 11-1018/91 TYPE B NA 1007
BA_14 11-0192/90 TYPE C 1017 NA
BA_14 11-0192/91 TYPE C 1203 NA
BA_17 11-1088/90 TYPE C NA 1043
CA_05 10-2828/91 TYPE B 519 NA
CA_05 11-0191/90 TYPE C 1020 NA
CA_05 11-0191/91 TYPE C 1031 NA
CA_06 10-2803/90 TYPE C 1010 NA
CA_06 11-1020/90 TYPE C NA 1020
CA_06 11-1020/91 TYPE C NA 1137
CA_06 11-1090/90 TYPE B 1122 NA
CA_06 11-1090/91 TYPE B NA 1077
CA_08 11-0958/90 TYPE B NA 1020
CA_08 11-0958/91 TYPE B NA 1006
CA_08 11-0969/90 TYPE B 1020 NA
CA_08 11-0969/91 TYPE B 1020 NA
CA_08 11-1080/90 TYPE B 480 NA
CA_08 11-1080/91 TYPE B 916 NA
CA_10 11-1017/90 TYPE B 1020 NA
CA_10 11-1017/91 TYPE B 989 NA
CA_10 11-1019/90 TYPE C NA 1020
CA_10 11-1019/91 TYPE C NA 1175
CA_11 11-1015/90 TYPE B NA 1020
CA_11 11-1015/91 TYPE B 1025 NA
CA_12 11-1026/90 TYPE C NA 1020
CA_12 11-1026/92 TYPE C NA 1195
CA_12 11-1093/90 TYPE B NA 1122
CA_12 11-1093/91 TYPE B NA 892
CA_13 10-2019/91 TYPE B 1075 NA
CA_15 10-2805/90 TYPE C 1002 NA
DT_11 11-0956/90 TYPE B NA 136
DT_11 11-0956/91 TYPE B NA 884
DT_11 11-0956/92 TYPE B NA 1020
DT_11 11-0956/93 TYPE B NA 1526
DT_11 11-0973/90 TYPE B NA 940
DT_11 11-0973/91 TYPE B NA 1020
DT_11 11-0973/92 TYPE B NA 1020
DT_11 11-0973/93 TYPE B NA 587
DT_11 11-1089/90 TYPE C 1136 NA
DT_11 11-1095/90 TYPE C 825 NA

Calculating the difference between length mean before and after changes

mean(demand_by_machine_best_products$AFTER, na.rm = T)-mean(demand_by_machine_best_products$BEFORE, na.rm = T)
## [1] 62.20337

Even it’s shown that the products have incresead the length by unit, 62 meters aren’t enough to affirm that the length has impact to the number of failures.
To finally check if there are a correlation between the length of products and number of failures, it will be calculated the correlation coefficient.

ggplot(production_failures) +
  geom_point(aes(x = LENGTH, y = TOTAL), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
  ggtitle('CORRELATION BETWEEN LENGTH OF PRODUCT AND FAILURES OCCURRENCES') +
  labs(subtitle = paste('Correlation coefficient',
                        round(cor(production_failures$LENGTH,
                            production_failures$TOTAL,
                            method = c('pearson', 'kendall', 'spearman')),2))) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face='bold'),
        plot.subtitle = element_text(vjust = 0.5, hjust = 0.5))


With a correlation coefficient close to 0, it’s proved that there aren’t a a strong correlation between the length of the product with the number of failures.
So, at this point, the first suggestion that the machines have not to do with the performance decreasement of the product TYPE A is being considered true, and the second one isn’t comproved.

ANALYSIS OF FREQUENCE OF FAILURES BY SECTIONS OF THE PRODUCT.

Since the product can be measured by length, by analysing the number of failures per secction can bring up some insights that could help to understand if the failures occur more at the begin of the proccess, at the middle of the proccess or at the end of the proccess.

Calculating the occurrences by secction of the product

Considering that the failures ID has a letter at the end, and this letters represent the secction of the product in a sequential order (A = begin, B = 2nd part, C = 3rd part, …) it may be used to calculate if the products have a tendence of more failures in a specific secction.

failures_by_section <- failures_original
failures_by_section <- failures_by_section %>% mutate(ID = str_extract(ID, '\\w$'))

Checking unique values

unique(failures_by_section$ID)
##  [1] "D" "C" "B" "A" "E" "F" "H" "I" "1" "J" "G" "0" "K"

0 and 1 should not be part of this analysis, because it’s due to products that aren’t splitted

failures_by_section <- failures_by_section %>% filter(ID != '0' & ID != '1')

Checking the frequency by secction

kable(table(failures_by_section$ID))
Var1 Freq
A 159
B 133
C 128
D 130
E 31
F 16
G 16
H 23
I 14
J 18
K 5

The secctions between A and D has the most number of occurrences. It regards to the most products are splitted into only four secctions.
To guarantee a better annalysis, it’s being considered to discard secctions above D

failures_by_section <- failures_by_section %>% filter(
  ID == 'A' | ID == 'B' | ID == 'C' | ID == 'D'
)

Counting the occurrence of failures by secction

failures_by_section <- distinct(
  failures_by_section %>%
    group_by(ID) %>%
    mutate(FAILURES = sum(TOTAL)) %>%
    select(ID, FAILURES),
  ID, .keep_all = T)

# Plotting the failures by secction
ggplot(failures_by_section) + 
  geom_col(aes(x = ID, y = FAILURES), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
  ggtitle('FAILURES BY SECCTION') +
  geom_text(aes(x = ID, y = FAILURES, label = FAILURES), vjust = -0.5, size = 3) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'))

As shown in the plot in general the most failures are at the begin of the products.
At this point it will be considered that it important to check if the setup of the machines have to do with this occurrences.

Checking if the performance has improved with the changes

To consider if the performance has improved, it’s important to select machines that have productions before and after the changes.

performance_comparison <- machine_performance %>%
  filter(STATUS_EVOLUTION == 1) %>%
  select(MACHINE, BEFORE, AFTER)

Testing the normality of the distribution of performance before and after changes

shapiro.test(performance_comparison$BEFORE)
## 
##  Shapiro-Wilk normality test
## 
## data:  performance_comparison$BEFORE
## W = 0.92412, p-value = 0.3926
shapiro.test(performance_comparison$AFTER)
## 
##  Shapiro-Wilk normality test
## 
## data:  performance_comparison$AFTER
## W = 0.97018, p-value = 0.8925

Since the both distribution was tested as normal, and the machines are the same, so the hypothesis test that will be reproduced is t test for parametric statistic.

The alternative hypothesis is: (mean of AFTER - mean of BEFORE) > 0
The confidence level is: 0.95

t.test(performance_comparison$AFTER, performance_comparison$BEFORE,
       alternative = 'greater',
       paired = T)
## 
##  Paired t-test
## 
## data:  performance_comparison$AFTER and performance_comparison$BEFORE
## t = 5.3593, df = 9, p-value = 0.0002284
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  132.0073      Inf
## sample estimates:
## mean of the differences 
##                200.6317

With a p-value reaching less than 0.05 the alternative hypothesis has been accepted, confirming that thechanges have made the difference in the process.

Plotting the results

performance_comparison <- melt(performance_comparison,
     id.vars = c('MACHINE'),
     variable.name = 'VERSION',
     value.name = 'PERFORMANCE',
     variable.factor = T)

ggplot(performance_comparison, aes(x = VERSION, y = PERFORMANCE, fill = VERSION)) +
  geom_boxplot(alpha = 0.5) +
  ggtitle('PERFORMANCE BEFORE AND AFTER CHANGES IN THE PROCESS') +
  stat_summary(fun.y = mean, colour = 'red', geom = 'point', shape = 3, size = 2) +
  theme(plot.title = element_text(vjus = 0.5, hjust = 0.5, face = 'bold'),
        legend.position = 'bottom',
        legend.title = element_text(face = 'bold'))

FINAL CONCLUSIONS

1 - There’s a evidence that probably the changes have contributed to reduce the frequency of the failures 4, 3, 2, 5 and 17.
2 - The failures aren’t centered in specific machines.
3 - The most failures are concentrated at the beginning of the products.
4 - The changes have done difference and contributed to improve the performance.