Failures_analysis

Loading the packages

library(readxl)
library(dplyr)
library(ggplot2)
library(data.table)
library(gridExtra)
library(rlang)
library(stringr)
library(knitr)
library(kableExtra)

Loading the file

# Checking the sheets
excel_sheets('failures.xlsm')

## [1] "FAILURES"   "ORDERS"     "PRODUCTION"

# Failures
failures <- read_excel('failures.xlsm', sheet = 'FAILURES', skip = 2)
production <- read_excel('failures.xlsm', sheet = 'PRODUCTION', skip = 2)

Preparing the dataset

# Checking the type of variables
str(failures)

## Classes 'tbl_df', 'tbl' and 'data.frame':    676 obs. of  45 variables:
##  $ ORDER    : chr  "095598.AA" "095598.AA" "095598.AA" "095598.AA" ...
##  $ ID       : chr  "10-2092/90D" "10-2084/90C" "10-2084/90B" "10-2081/02C" ...
##  $ LENGTH   : num  222 202 202 202 204 202 202 202 202 102 ...
##  $ MACHINE  : chr  "BA_12" "BA_15" "BA_15" "BA_04" ...
##  $ DATE     : POSIXct, format: "2018-12-04" "2018-12-04" ...
##  $ 1        : logi  NA NA NA NA NA NA ...
##  $ 2        : chr  "22" NA NA NA ...
##  $ 3        : chr  NA NA NA NA ...
##  $ 4        : chr  "95, 120, 124" "152" NA NA ...
##  $ 5        : chr  NA NA "135" NA ...
##  $ 6        : chr  NA NA NA NA ...
##  $ 7        : chr  NA NA NA NA ...
##  $ 8        : chr  NA NA NA NA ...
##  $ 9        : chr  NA NA NA NA ...
##  $ 10       : chr  NA NA NA "130" ...
##  $ 11       : chr  NA NA NA NA ...
##  $ 12       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 13       : chr  NA NA NA NA ...
##  $ 14       : logi  NA NA NA NA NA NA ...
##  $ 15       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 16       : chr  "23" NA NA NA ...
##  $ 17       : chr  "147" NA NA NA ...
##  $ 18       : chr  NA NA NA NA ...
##  $ 19_NOTES : chr  NA NA NA NA ...
##  $ 19_LENGTH: chr  NA NA NA NA ...
##  $ 1_       : logi  NA NA NA NA NA NA ...
##  $ 2_       : num  1 NA NA NA NA 1 NA 1 NA NA ...
##  $ 3_       : num  NA NA NA NA 1 NA NA NA NA NA ...
##  $ 4_       : num  3 1 NA NA 1 NA 1 NA NA 1 ...
##  $ 5_       : num  NA NA 1 NA NA NA NA NA 1 1 ...
##  $ 6_       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 7_       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 8_       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 9_       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 10_      : num  NA NA NA 1 NA NA NA NA NA NA ...
##  $ 11_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 12_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 13_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 14_      : logi  NA NA NA NA NA NA ...
##  $ 15_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 16_      : num  1 NA NA NA NA NA NA NA NA NA ...
##  $ 17_      : num  1 NA NA NA NA NA NA NA NA 1 ...
##  $ 18_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 19_      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TOTAL    : num  6 1 1 1 2 1 1 1 1 3 ...

# Setting failures$MACHINE as categorical
failures$MACHINE <- as.factor(failures$MACHINE)

# Setting production$MACHINE and production$VERSION as categorical
production$MACHINE <- as.factor(production$MACHINE)
# Replacing VERSION NAs with 0
production$VERSION[is.na(production$VERSION)] <- 0
production$VERSION <- as.factor(production$VERSION)
# Setting 0 as BEFORE and 1 as AFTER
levels(production$VERSION) <- c('BEFORE', 'AFTER')

# Standardizing IDs of the failures set
head(production$ID)

## [1] "10-2754/91" "10-2737/91" "10-2743/90" "10-2759/90" "10-2736/91"
## [6] "10-2758/90"

head(failures$ID)

## [1] "10-2092/90D" "10-2084/90C" "10-2084/90B" "10-2081/02C" "10-2066/91D"
## [6] "10-2066/91C"

# To standardize the ID of failures set, it's only needed to remove the letter from the end of the ID
failures_original <- failures
failures$ID <- gsub('\\D$', '', failures$ID)

PERFOMANCE ANALYSES

Considering that the current situation regards to the failures of a production, it’s important to consider relational metrics intead of absolute: Perfomance instead of Production or Failures.
The performance metric that will be chosen is an adaptation of MTBF (Mean Tima Between Failures) concept.
For this project the performance will be measure by: Produced Length Between Failures.
To be able to calculate this performance, it’s important to discard failures registers that don’t have production registers.

Removing registers from the failures set which don’t have production registers.

registers <- as.vector(unlist(production$ID))
failures <- failures %>% filter(ID %in% registers)

Merging productions and failures data sets by ID

colnames(failures)

##  [1] "ORDER"     "ID"        "LENGTH"    "MACHINE"   "DATE"     
##  [6] "1"         "2"         "3"         "4"         "5"        
## [11] "6"         "7"         "8"         "9"         "10"       
## [16] "11"        "12"        "13"        "14"        "15"       
## [21] "16"        "17"        "18"        "19_NOTES"  "19_LENGTH"
## [26] "1_"        "2_"        "3_"        "4_"        "5_"       
## [31] "6_"        "7_"        "8_"        "9_"        "10_"      
## [36] "11_"       "12_"       "13_"       "14_"       "15_"      
## [41] "16_"       "17_"       "18_"       "19_"       "TOTAL"

production_failures <- merge(production, failures[, c(2, 6:45)], by = 'ID', all = T)
kable(head(production_failures, 10)) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed'),
                font_size = 9)

ID	ORDER	PRODUCT	MACHINE	DATE	LENGTH	VERSION	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19_NOTES	19_LENGTH	1_	2_	3_	4_	5_	6_	7_	8_	9_	10_	11_	12_	13_	14_	15_	16_	17_	18_	19_	TOTAL
10-0650/90	091284.AA	TYPE A	BA_12	2018-04-27	808	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	93	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	24	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	22, 75	NA	NA	NA	NA	1	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	2	NA	NA	3
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	88	62	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	8	NA	NA	NA	NA	NA	NA	1	1	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1	NA	NA	3
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	66	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1	NA	NA	1
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	NA	94	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	13	NA	NA	NA	NA	NA	NA	NA	1	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1	NA	NA	2
10-2090/91	095598.AA	TYPE A	BA_10	2018-12-04	797	BEFORE	NA	NA	NA	48, 64 A 78, 83, 85, 144	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	5	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	5
10-2090/91	095598.AA	TYPE A	BA_10	2018-12-04	797	BEFORE	NA	NA	NA	159	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1
10-2092/91	095598.AA	TYPE A	BA_12	2018-12-05	784	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	43	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1	NA	NA	NA	1
10-2092/91	095598.AA	TYPE A	BA_12	2018-12-05	784	BEFORE	NA	NA	NA	20, 26, 34	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	3	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	3

colnames(production_failures)

##  [1] "ID"        "ORDER"     "PRODUCT"   "MACHINE"   "DATE"     
##  [6] "LENGTH"    "VERSION"   "1"         "2"         "3"        
## [11] "4"         "5"         "6"         "7"         "8"        
## [16] "9"         "10"        "11"        "12"        "13"       
## [21] "14"        "15"        "16"        "17"        "18"       
## [26] "19_NOTES"  "19_LENGTH" "1_"        "2_"        "3_"       
## [31] "4_"        "5_"        "6_"        "7_"        "8_"       
## [36] "9_"        "10_"       "11_"       "12_"       "13_"      
## [41] "14_"       "15_"       "16_"       "17_"       "18_"      
## [46] "19_"       "TOTAL"

production_failures[, c(29:47)][is.na(production_failures[, c(29:47)])] <- 0
kable(head(production_failures, 10)) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed'),
                font_size = 9)

ID	ORDER	PRODUCT	MACHINE	DATE	LENGTH	VERSION	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19_NOTES	19_LENGTH	1_	2_	4_	5_	16_	17_	TOTAL
10-0650/90	091284.AA	TYPE A	BA_12	2018-04-27	808	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	0	0	0	0	0
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	93	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	1	0	0	0	1
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	24	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	22, 75	NA	NA	NA	NA	1	0	0	0	2	3
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	88	62	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	8	NA	NA	NA	NA	0	1	1	0	1	3
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	66	NA	NA	NA	NA	0	0	0	0	1	1
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	NA	94	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	13	NA	NA	NA	NA	0	0	1	0	1	2
10-2090/91	095598.AA	TYPE A	BA_10	2018-12-04	797	BEFORE	NA	NA	NA	48, 64 A 78, 83, 85, 144	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	5	0	0	0	5
10-2090/91	095598.AA	TYPE A	BA_10	2018-12-04	797	BEFORE	NA	NA	NA	159	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	1	0	0	0	1
10-2092/91	095598.AA	TYPE A	BA_12	2018-12-05	784	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	43	NA	NA	NA	NA	NA	0	0	0	1	0	1
10-2092/91	095598.AA	TYPE A	BA_12	2018-12-05	784	BEFORE	NA	NA	NA	20, 26, 34	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	3	0	0	0	3

Calculating the performance

production_failures <- production_failures %>%
  mutate(PERFORMANCE = if_else(TOTAL == 0, LENGTH, LENGTH / TOTAL))
kable(head(production_failures, 10)) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed'),
                font_size = 9)

ID	ORDER	PRODUCT	MACHINE	DATE	LENGTH	VERSION	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19_NOTES	19_LENGTH	1_	2_	4_	5_	16_	17_	TOTAL	PERFORMANCE
10-0650/90	091284.AA	TYPE A	BA_12	2018-04-27	808	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	0	0	0	0	0	808.0000
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	93	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	1	0	0	0	1	1075.0000
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	24	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	22, 75	NA	NA	NA	NA	1	0	0	0	2	3	358.3333
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	88	62	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	8	NA	NA	NA	NA	0	1	1	0	1	3	358.3333
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	66	NA	NA	NA	NA	0	0	0	0	1	1	1075.0000
10-2019/91	094011.AA	TYPE B	CA_13	2018-12-03	1075	BEFORE	NA	NA	NA	NA	94	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	13	NA	NA	NA	NA	0	0	1	0	1	2	537.5000
10-2090/91	095598.AA	TYPE A	BA_10	2018-12-04	797	BEFORE	NA	NA	NA	48, 64 A 78, 83, 85, 144	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	5	0	0	0	5	159.4000
10-2090/91	095598.AA	TYPE A	BA_10	2018-12-04	797	BEFORE	NA	NA	NA	159	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	1	0	0	0	1	797.0000
10-2092/91	095598.AA	TYPE A	BA_12	2018-12-05	784	BEFORE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	43	NA	NA	NA	NA	NA	0	0	0	1	0	1	784.0000
10-2092/91	095598.AA	TYPE A	BA_12	2018-12-05	784	BEFORE	NA	NA	NA	20, 26, 34	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0	3	0	0	0	3	261.3333

Checking the occurrences of each failure before the changes

num_failures_before = production_failures[, c(7, 28:46)] %>% filter(VERSION == 'BEFORE')
num_failures_before[is.na(num_failures_before)] <- 0
num_failures_before <- num_failures_before[, -1]
num_failures_before <- data.frame(cbind(
  FAILURES = as.factor(gsub('_', '', colnames(production_failures[, c(28:46)]))),
  FREQUENCY = apply(num_failures_before, 2, sum)))
num_failures_before <- num_failures_before[order(-num_failures_before$FREQUENCY),]
num_failures_before <- num_failures_before %>% mutate(
  CUM_FAILURES = cumsum(FREQUENCY),
  CUM_PERC = round((CUM_FAILURES / sum(FREQUENCY) * 100), 2)
)

# Plotting the occurrences
ggplot(num_failures_before, aes(x = reorder(FAILURES, CUM_FAILURES))) +
  geom_col(aes(y = FREQUENCY), fill = 'lightblue', color = 'darkblue', alpha = 0.7, group = 1) +
  ggtitle('FREQUENCY OF FAILURES BEFORE THE CHANGES') +
  geom_line(aes(y = CUM_PERC*264/100), group = 2, color = 'darkred') +
  geom_point(aes(y = CUM_PERC*264/100), group = 2, shape = 21, color = 'darkred', fill = 'darkred', size = 2) +
  geom_text(aes(y = CUM_PERC*264/100, label = paste(CUM_PERC, '%', sep = '')), vjust = -1, size = 3) +
  scale_y_continuous(sec.axis = sec_axis(~.*100/264, name = 'Cumulative percentage'), limits = c(0, 264)) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'))

As shown in the plot, 6 failures represent about 80% of the total of failures BEFORE the changes: 4, 3, 16, 2, 5 and 17.

Checking the occurrences of each failure after the changes

num_failures_after = production_failures[, c(7, 28:46)] %>% filter(VERSION == 'AFTER')
num_failures_after[is.na(num_failures_after)] <- 0
num_failures_after <- num_failures_after[, -1]
num_failures_after <- data.frame(cbind(
  FAILURES = as.factor(gsub('_', '', colnames(production_failures[, c(28:46)]))),
  FREQUENCY = apply(num_failures_after, 2, sum)))
num_failures_after <- num_failures_after[order(-num_failures_after$FREQUENCY),]
num_failures_after <- num_failures_after %>% mutate(
  CUM_FAILURES = cumsum(FREQUENCY),
  CUM_PERC = round((CUM_FAILURES / sum(FREQUENCY) * 100), 2)
)

# Plotting the occurrences
ggplot(num_failures_after, aes(x = reorder(FAILURES, -FREQUENCY))) +
  geom_col(aes(y = FREQUENCY), fill = 'lightblue', color = 'darkblue', alpha = 0.7, group = 1) +
  ggtitle('FREQUENCY OF FAILURES AFTER THE CHANGES') +
  geom_line(aes(y = CUM_PERC*61/100), group = 2, color = 'darkred') +
  geom_point(aes(y = CUM_PERC*61/100), group = 2, shape = 21, color = 'darkred', fill = 'darkred', size = 2) +
  geom_text(aes(y = CUM_PERC*61/100, label = paste(CUM_PERC, '%', sep = '')), vjust = -1, size = 3) +
  scale_y_continuous(sec.axis = sec_axis(~.*100/61, name = 'Cumulative percentage'), limits = c(0, 61)) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'))

After the changes there are again 6 failures which represent about 80% of the total.
But only the failure 16 from the list of before changes is repeated.
It shows that probably the changes have contributed to the improvements of that failures.

Even though only the mean of performance can’t be enough to make decisions, it’s a simple way to check the evolution between before and after modifications.

Mean of Performance before and after changes

ggplot(distinct(production_failures %>%
           group_by(VERSION) %>%
           mutate(PERFORMANCE = mean(PERFORMANCE)) %>%
           select(VERSION, PERFORMANCE),
         VERSION, .keep_all = T)) +
  geom_col(aes(x = VERSION, y = PERFORMANCE), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
  ggtitle('MEAN OF PERFORMANCE BEFORE AND AFTER MODIFICATIONS') +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'))

As expected, the mean of performance after modifications is higher than before
Function to plot the comparisons of performance between before and after

compare_performances <- function(df, variable){
  target <- parse_quosure(variable) #quo
  # Listing the occurrences of variable to be compared
  list_to_compare <- as.vector(df[, variable])
  
  lapply(list_to_compare, function(x){
    before = ggplot(df %>% filter(!!target == x), aes(x = !!target)) +
      geom_col(aes(y = BEFORE), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
      ggtitle('PERFORMANCE BEFORE CHANGES') +
      geom_text(aes(y = BEFORE, label = round(BEFORE)), vjust = -0.5, size = 3) +
      theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold')) +
      ylim(0, if_else(max(df$BEFORE) > max(df$AFTER), max(df$BEFORE), max(df$AFTER)))
    
    after = ggplot(df %>% filter(!!target == x), aes(x = !!target)) +
      geom_col(aes(y = AFTER), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
      ggtitle('PERFORMANCE AFTER CHANGES') +
      geom_text(aes(y = AFTER, label = round(AFTER)), vjust = -0.5, size = 3) +
      theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold')) +
      ylim(0, if_else(max(df$BEFORE) > max(df$AFTER), max(df$BEFORE), max(df$AFTER)))
    
    grid.arrange(before, after, ncol = 2)
  })
}

Mean of Performance before and after changes by product

product_performance <- dcast(distinct(production_failures %>%
                                        group_by(PRODUCT, VERSION) %>%
                                        mutate(PERFORMANCE = mean(PERFORMANCE)) %>%
                                        select(PRODUCT, VERSION, PERFORMANCE),
                                      PRODUCT, VERSION, PERFORMANCE, .keep_all = T),
                             PRODUCT ~ VERSION)

## Using 'PERFORMANCE' as value column. Use 'value.var' to override

# Replacing NA values with 0
product_performance[is.na(product_performance)] <- 0
# Calculating the evolution of performance for products that was produced both before and after changes
product_performance <- product_performance %>%
  mutate(DIF_PERFORMANCE = if_else(BEFORE == 0 | AFTER == 0, 0, AFTER - BEFORE),
         STATUS_EVOLUTION = if_else(DIF_PERFORMANCE > 0, 1, -1))
# Plotting the performances
compare_performances(product_performance, 'PRODUCT')

product_performance

Having 0 performance, it means that the production don’t repeat for both versions.
Either it stoped the production after changes or it started the production.
But keeping these products in the dataset is quite important to check the variation of performances.

Checking the evolution between products

ggplot(product_performance %>% filter(DIF_PERFORMANCE != 0)) +
  geom_col(aes(x = PRODUCT,
               y = DIF_PERFORMANCE,
               fill = factor(STATUS_EVOLUTION, labels = c('Worse', 'Better'))), alpha = 0.5) +
  ggtitle('EVOLUTION OF PERFORMANCE BY PRODUCT') +
  labs(fill = 'Situation') +
  geom_text(aes(x = PRODUCT, y = DIF_PERFORMANCE, label = round(DIF_PERFORMANCE, 2)), vjust = -0.5, size = 3) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'),
        legend.position = 'bottom',
        legend.title = element_text(face = 'bold'))

Between the products with production before and after changes, only TYPE A got worse.
List of products which has gotten worse

worse_products <- as.vector(unlist(product_performance %>% filter(DIF_PERFORMANCE < 0) %>% select(PRODUCT)))

List of products which has improved

best_products <- as.vector(unlist(product_performance %>% filter(DIF_PERFORMANCE > 0) %>% select(PRODUCT)))

MEAN OF PERFORMANCE BEFORE AND AFTER CHANGES BY MACHINE

machine_performance <- dcast(distinct(production_failures %>%
                                        group_by(MACHINE, VERSION) %>%
                                        mutate(PERFORMANCE = mean(PERFORMANCE)) %>%
                                        select(MACHINE, VERSION, PERFORMANCE),
                                      MACHINE, VERSION, PERFORMANCE, .keep_all = T),
                             MACHINE ~ VERSION)

## Using 'PERFORMANCE' as value column. Use 'value.var' to override

# Replacing NA values with 0
machine_performance[is.na(machine_performance)] <- 0
# Calculating the evolution of performance for machines that was produced both before and after changes
machine_performance <- machine_performance %>%
  mutate(DIF_PERFORMANCE = if_else(BEFORE == 0 | AFTER == 0, 0, AFTER - BEFORE),
         STATUS_EVOLUTION = if_else(DIF_PERFORMANCE > 0, 1, -1))
# Plotting the performances
compare_performances(machine_performance, 'MACHINE')

Having 0 of performance, it means that the production wont repeat for both versions.
Either it stoped the production after changes or it started the production.
But keeping these machines in the dataset is quite important to check the variation of performances.

Checking the evolution between machines

ggplot(machine_performance %>% filter(DIF_PERFORMANCE != 0)) +
  geom_col(aes(x = MACHINE,
               y = DIF_PERFORMANCE,
               fill = factor(STATUS_EVOLUTION, labels = c('Worse', 'Better'))), alpha = 0.5) +
  ggtitle('EVOLUTION OF PERFORMANCE BY MACHINE') +
  labs(fill = 'Situation') +
  geom_text(aes(x = MACHINE, y = DIF_PERFORMANCE, label = round(DIF_PERFORMANCE, 2)), vjust = -0.5, size = 3) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'),
        legend.position = 'bottom',
        legend.title = element_text(face = 'bold'))

Between the machines with production before and after changes, three of them got worse: BA_13, BA_14, CA_06 and DT_12.
But for the whole department, the most machines improved.

List of machines which has gotten worse

worse_machines <- as.vector(unlist(machine_performance %>% filter(DIF_PERFORMANCE < 0) %>% select(MACHINE)))

List of machines which has improved

best_machines <- as.vector(unlist(machine_performance %>% filter(DIF_PERFORMANCE > 0) %>% select(MACHINE)))

Checking what machines have produced the worse products

demand_by_machine_worse_products <- dcast(distinct(production_failures %>%
           filter(PRODUCT %in% worse_products) %>%
           group_by(PRODUCT, MACHINE, VERSION) %>%
           mutate(PRODUCTION = sum(LENGTH)) %>%
           select(PRODUCT, MACHINE, VERSION, PRODUCTION),
         MACHINE, VERSION, .keep_all = T),
      MACHINE + PRODUCT ~ VERSION)

## Using 'PRODUCTION' as value column. Use 'value.var' to override

demand_by_machine_worse_products[is.na(demand_by_machine_worse_products)] <- 0
demand_by_machine_worse_products %>% mutate(DIF_DEMAND = if_else(BEFORE > AFTER, 'REDUCED', 'INCREASED'))

##    MACHINE PRODUCT BEFORE AFTER DIF_DEMAND
## 1    BA_01  TYPE A  20290     0    REDUCED
## 2    BA_04  TYPE A   9360     0    REDUCED
## 3    BA_05  TYPE A   6835     0    REDUCED
## 4    BA_06  TYPE A  24036     0    REDUCED
## 5    BA_07  TYPE A  40180  8073    REDUCED
## 6    BA_08  TYPE A  40334 11267    REDUCED
## 7    BA_09  TYPE A  16851     0    REDUCED
## 8    BA_10  TYPE A  25855     0    REDUCED
## 9    BA_12  TYPE A  15908  4800    REDUCED
## 10   BA_15  TYPE A  11300 10462    REDUCED
## 11   DT_11  TYPE A  36457     0    REDUCED
## 12   DT_12  TYPE A  83804 29557    REDUCED

As checked above, all the machines that produced the worse products have reduced the demand.
There are at least two basic conclusions:
1st: The machines have not to do with the descreasement of the mean of performance
2nd: As shorter the products are, as more it’s fails

To test this second one, it’s important to check if the demand by machine of the best products have increased.

demand_by_machine_best_products <- dcast(distinct(production_failures %>%
                                                    filter(PRODUCT %in% best_products) %>%
                                                    group_by(PRODUCT, MACHINE, VERSION) %>%
                                                    mutate(PRODUCTION = sum(LENGTH)) %>%
                                                    select(PRODUCT, MACHINE, VERSION, PRODUCTION),
                                                  MACHINE, VERSION, .keep_all = T),
                                         MACHINE + PRODUCT ~ VERSION)

## Using 'PRODUCTION' as value column. Use 'value.var' to override

demand_by_machine_best_products[is.na(demand_by_machine_best_products)] <- 0
demand_by_machine_best_products <- demand_by_machine_best_products %>%
  mutate(DIF_DEMAND = if_else(BEFORE > AFTER, 'REDUCED', 'INCREASED'))
kable(table(demand_by_machine_best_products$DIF_DEMAND))

Var1	Freq
INCREASED	9
REDUCED	12

In general, the most cases show that the demand has decreased.

But to assume it’s true, it’s important to measure the difference of length mean.

demand_by_machine_best_products <- dcast(distinct(production_failures %>%
                                                    filter(PRODUCT %in% best_products) %>%
                                                    group_by(PRODUCT, MACHINE, ID, VERSION) %>%
                                                    mutate(PRODUCTION = mean(LENGTH)) %>%
                                                    select(PRODUCT, MACHINE, ID, VERSION, PRODUCTION),
                                                  MACHINE, VERSION, .keep_all = T),
                                         MACHINE + ID + PRODUCT ~ VERSION)

## Using 'PRODUCTION' as value column. Use 'value.var' to override

kable(demand_by_machine_best_products) %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed'),
                font_size = 9)

MACHINE	ID	PRODUCT	BEFORE	AFTER
BA_02	11-1078/90	TYPE B	NA	1020
BA_02	11-1078/91	TYPE B	NA	1014
BA_08	11-0966/93	TYPE B	NA	513
BA_09	10-2828/90	TYPE B	286	NA
BA_09	11-0966/90	TYPE B	NA	1020
BA_09	11-0966/91	TYPE B	NA	1020
BA_09	11-0966/92	TYPE B	NA	1020
BA_09	11-1087/90	TYPE B	NA	1122
BA_09	11-1087/91	TYPE B	NA	1032
BA_10	11-0975/90	TYPE B	1020	NA
BA_10	11-0975/91	TYPE B	NA	1020
BA_10	11-0975/92	TYPE B	1020	NA
BA_10	11-0975/93	TYPE B	547	NA
BA_13	11-1018/90	TYPE B	1020	NA
BA_13	11-1018/91	TYPE B	NA	1007
BA_14	11-0192/90	TYPE C	1017	NA
BA_14	11-0192/91	TYPE C	1203	NA
BA_17	11-1088/90	TYPE C	NA	1043
CA_05	10-2828/91	TYPE B	519	NA
CA_05	11-0191/90	TYPE C	1020	NA
CA_05	11-0191/91	TYPE C	1031	NA
CA_06	10-2803/90	TYPE C	1010	NA
CA_06	11-1020/90	TYPE C	NA	1020
CA_06	11-1020/91	TYPE C	NA	1137
CA_06	11-1090/90	TYPE B	1122	NA
CA_06	11-1090/91	TYPE B	NA	1077
CA_08	11-0958/90	TYPE B	NA	1020
CA_08	11-0958/91	TYPE B	NA	1006
CA_08	11-0969/90	TYPE B	1020	NA
CA_08	11-0969/91	TYPE B	1020	NA
CA_08	11-1080/90	TYPE B	480	NA
CA_08	11-1080/91	TYPE B	916	NA
CA_10	11-1017/90	TYPE B	1020	NA
CA_10	11-1017/91	TYPE B	989	NA
CA_10	11-1019/90	TYPE C	NA	1020
CA_10	11-1019/91	TYPE C	NA	1175
CA_11	11-1015/90	TYPE B	NA	1020
CA_11	11-1015/91	TYPE B	1025	NA
CA_12	11-1026/90	TYPE C	NA	1020
CA_12	11-1026/92	TYPE C	NA	1195
CA_12	11-1093/90	TYPE B	NA	1122
CA_12	11-1093/91	TYPE B	NA	892
CA_13	10-2019/91	TYPE B	1075	NA
CA_15	10-2805/90	TYPE C	1002	NA
DT_11	11-0956/90	TYPE B	NA	136
DT_11	11-0956/91	TYPE B	NA	884
DT_11	11-0956/92	TYPE B	NA	1020
DT_11	11-0956/93	TYPE B	NA	1526
DT_11	11-0973/90	TYPE B	NA	940
DT_11	11-0973/91	TYPE B	NA	1020
DT_11	11-0973/92	TYPE B	NA	1020
DT_11	11-0973/93	TYPE B	NA	587
DT_11	11-1089/90	TYPE C	1136	NA
DT_11	11-1095/90	TYPE C	825	NA

Calculating the difference between length mean before and after changes

mean(demand_by_machine_best_products$AFTER, na.rm = T)-mean(demand_by_machine_best_products$BEFORE, na.rm = T)

## [1] 62.20337

Even it’s shown that the products have incresead the length by unit, 62 meters aren’t enough to affirm that the length has impact to the number of failures.
To finally check if there are a correlation between the length of products and number of failures, it will be calculated the correlation coefficient.

ggplot(production_failures) +
  geom_point(aes(x = LENGTH, y = TOTAL), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
  ggtitle('CORRELATION BETWEEN LENGTH OF PRODUCT AND FAILURES OCCURRENCES') +
  labs(subtitle = paste('Correlation coefficient',
                        round(cor(production_failures$LENGTH,
                            production_failures$TOTAL,
                            method = c('pearson', 'kendall', 'spearman')),2))) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face='bold'),
        plot.subtitle = element_text(vjust = 0.5, hjust = 0.5))

With a correlation coefficient close to 0, it’s proved that there aren’t a a strong correlation between the length of the product with the number of failures.
So, at this point, the first suggestion that the machines have not to do with the performance decreasement of the product TYPE A is being considered true, and the second one isn’t comproved.

ANALYSIS OF FREQUENCE OF FAILURES BY SECTIONS OF THE PRODUCT.

Since the product can be measured by length, by analysing the number of failures per secction can bring up some insights that could help to understand if the failures occur more at the begin of the proccess, at the middle of the proccess or at the end of the proccess.

Calculating the occurrences by secction of the product

Considering that the failures ID has a letter at the end, and this letters represent the secction of the product in a sequential order (A = begin, B = 2nd part, C = 3rd part, …) it may be used to calculate if the products have a tendence of more failures in a specific secction.

failures_by_section <- failures_original
failures_by_section <- failures_by_section %>% mutate(ID = str_extract(ID, '\\w$'))

Checking unique values

unique(failures_by_section$ID)

##  [1] "D" "C" "B" "A" "E" "F" "H" "I" "1" "J" "G" "0" "K"

0 and 1 should not be part of this analysis, because it’s due to products that aren’t splitted

failures_by_section <- failures_by_section %>% filter(ID != '0' & ID != '1')

Checking the frequency by secction

kable(table(failures_by_section$ID))

Var1	Freq
A	159
B	133
C	128
D	130
E	31
F	16
G	16
H	23
I	14
J	18
K	5

The secctions between A and D has the most number of occurrences. It regards to the most products are splitted into only four secctions.
To guarantee a better annalysis, it’s being considered to discard secctions above D

failures_by_section <- failures_by_section %>% filter(
  ID == 'A' | ID == 'B' | ID == 'C' | ID == 'D'
)

Counting the occurrence of failures by secction

failures_by_section <- distinct(
  failures_by_section %>%
    group_by(ID) %>%
    mutate(FAILURES = sum(TOTAL)) %>%
    select(ID, FAILURES),
  ID, .keep_all = T)

# Plotting the failures by secction
ggplot(failures_by_section) + 
  geom_col(aes(x = ID, y = FAILURES), fill = 'lightblue', color = 'darkblue', alpha = 0.7) +
  ggtitle('FAILURES BY SECCTION') +
  geom_text(aes(x = ID, y = FAILURES, label = FAILURES), vjust = -0.5, size = 3) +
  theme(plot.title = element_text(vjust = 0.5, hjust = 0.5, face = 'bold'))

As shown in the plot in general the most failures are at the begin of the products.
At this point it will be considered that it important to check if the setup of the machines have to do with this occurrences.

Checking if the performance has improved with the changes

To consider if the performance has improved, it’s important to select machines that have productions before and after the changes.

performance_comparison <- machine_performance %>%
  filter(STATUS_EVOLUTION == 1) %>%
  select(MACHINE, BEFORE, AFTER)

Testing the normality of the distribution of performance before and after changes

shapiro.test(performance_comparison$BEFORE)

## 
##  Shapiro-Wilk normality test
## 
## data:  performance_comparison$BEFORE
## W = 0.92412, p-value = 0.3926

shapiro.test(performance_comparison$AFTER)

## 
##  Shapiro-Wilk normality test
## 
## data:  performance_comparison$AFTER
## W = 0.97018, p-value = 0.8925

Since the both distribution was tested as normal, and the machines are the same, so the hypothesis test that will be reproduced is t test for parametric statistic.

The alternative hypothesis is: (mean of AFTER - mean of BEFORE) > 0
The confidence level is: 0.95

t.test(performance_comparison$AFTER, performance_comparison$BEFORE,
       alternative = 'greater',
       paired = T)

## 
##  Paired t-test
## 
## data:  performance_comparison$AFTER and performance_comparison$BEFORE
## t = 5.3593, df = 9, p-value = 0.0002284
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  132.0073      Inf
## sample estimates:
## mean of the differences 
##                200.6317

With a p-value reaching less than 0.05 the alternative hypothesis has been accepted, confirming that thechanges have made the difference in the process.

Plotting the results

performance_comparison <- melt(performance_comparison,
     id.vars = c('MACHINE'),
     variable.name = 'VERSION',
     value.name = 'PERFORMANCE',
     variable.factor = T)

ggplot(performance_comparison, aes(x = VERSION, y = PERFORMANCE, fill = VERSION)) +
  geom_boxplot(alpha = 0.5) +
  ggtitle('PERFORMANCE BEFORE AND AFTER CHANGES IN THE PROCESS') +
  stat_summary(fun.y = mean, colour = 'red', geom = 'point', shape = 3, size = 2) +
  theme(plot.title = element_text(vjus = 0.5, hjust = 0.5, face = 'bold'),
        legend.position = 'bottom',
        legend.title = element_text(face = 'bold'))

FINAL CONCLUSIONS

1 - There’s a evidence that probably the changes have contributed to reduce the frequency of the failures 4, 3, 2, 5 and 17.
2 - The failures aren’t centered in specific machines.
3 - The most failures are concentrated at the beginning of the products.
4 - The changes have done difference and contributed to improve the performance.