
Anomaly Detection¶

This notebook is a step-by-step guide to various methods that can be implemented for anomaly detection.

References

  • Medium: Anomaly Detection in Time Series using ChatGPT
  • Medium: Isolation Forest
  • Medium: Introduction To Anomaly Detection Methods
  • Kaggle: Numenta Anomaly Benchmark (NAB)
  • Kaggle: Unsupervised Anomaly Detection

Statistical Techniques¶

Dataset

Numenta Anomaly Benchmark (NAB): Contains a variety of real-world time series data with labeled anomalies. We will use the "machine_temperature_system_failure.csv" dataset, which consists of temperature readings from a machine in an industrial setting.

What is the Numenta Anomaly Benchmark (NAB) dataset and how can I access it?

The Numenta Anomaly Benchmark (NAB) is a benchmark dataset developed by the research team at Numenta to evaluate anomaly detection algorithms. The dataset consists of real-world time-series data that have been labeled with anomalies, making it useful for testing and comparing different anomaly detection algorithms. The dataset includes a variety of data types, including server metrics, network traffic, and environmental sensor data.

To access the NAB dataset, you can visit the official repository at https://github.com/numenta/NAB. The dataset is available for free and can be downloaded from the GitHub repository. It comes with a set of Python scripts that can be used to evaluate the performance of different anomaly detection algorithms on the dataset, and the repository also provides documentation and examples to help you get started.

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/numenta/NAB/master/data/realKnownCause/machine_temperature_system_failure.csv')

# Convert the timestamp column to a datetime object
df['timestamp'] = pd.to_datetime(df['timestamp'])

Moving Average (MA)¶

The MA technique calculates the moving average of the data over a specified time window and detects anomalies based on deviations from the moving average.

The following code calculates the moving average of the temperature readings, the deviation of each reading from the moving average, the rolling standard deviation of that deviation, and the resulting threshold for anomaly detection. It then flags readings whose absolute deviation exceeds the threshold as anomalies and plots the temperature readings together with the detected anomalies.

The code also plots the moving average and threshold for anomaly detection as a gray shaded region.

In [ ]:
def moving_average(df):
  # Calculate the moving average of the temperature readings
  window_size = 200 # modified from the original value of 50
  ma = df['value'].rolling(window_size).mean()

  # Calculate the deviation from the moving average
  deviation = df['value'] - ma

  # Calculate the standard deviation of the deviation
  std_deviation = deviation.rolling(window_size).std()

  # Calculate the threshold for anomaly detection
  threshold = 3 * std_deviation

  # Detect anomalies based on deviations from the moving average
  anomalies = df[deviation.abs() > threshold]
  return [anomalies,ma,threshold]
In [ ]:
def plot(df,anomalies, mean,threshold):
  # Plot the temperature readings and the anomalies
  plt.subplots(figsize=(14, 10))
  plt.plot(df['timestamp'], df['value'], color='blue', label='Temperature Readings')
  plt.scatter(anomalies['timestamp'], anomalies['value'], color='red', label='Anomalies')
  plt.fill_between(df['timestamp'], mean-threshold, mean+threshold, color='gray', alpha=0.2, label='Threshold')
  plt.legend()
  plt.title('Machine Temperature Anomaly Detection')
  plt.xlabel('Date')
  plt.ylabel('Temperature (Celsius)')
  plt.grid()
  plt.show()
In [ ]:
anomalies,ma,threshold = moving_average(df)
plot(df,anomalies,ma,threshold)
[Figure: Machine Temperature Anomaly Detection (moving-average method)]

Why did we use threshold 3 times the standard deviation?

The choice of three times the standard deviation as the threshold comes from the fact that, in a normal distribution, approximately 99.7% of the data falls within three standard deviations of the mean.

Setting the threshold at the mean plus or minus three standard deviations therefore captures the vast majority of the data in a normal distribution. This threshold is often used in statistical analysis to identify outliers or extreme values in a dataset.

However, it's important to note that this rule may not always be appropriate for all datasets, particularly if the distribution is not normal or if the data has outliers. In these cases, alternative methods for setting thresholds may be more appropriate.
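
As a quick sanity check of the 99.7% figure, here is a small sketch (an addition for illustration, not part of the original analysis) that draws samples from a standard normal distribution and measures the fraction falling within one, two, and three standard deviations of the mean.

In [ ]:
import numpy as np

rng = np.random.default_rng(0)
samples = rng.normal(loc=0.0, scale=1.0, size=1_000_000)

# Fraction of samples within k standard deviations of the mean
for k in (1, 2, 3):
    fraction = np.mean(np.abs(samples - samples.mean()) <= k * samples.std())
    print(f"within {k} std: {fraction:.4f}")
# Roughly 0.683, 0.954 and 0.997 are expected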

Z-Score Analysis¶

The Z-score is a statistical measure that tells you how far away a single data point is from the mean of a data set. In a standard normal distribution, data points that fall far from the mean (typically more than 2 or 3 standard deviations away) are considered outliers or anomalies.

The Z-score of a data point $x$ is calculated as: \begin{equation} Z_{score} = \frac{x-\mu}{\sigma} \end{equation}

A Z-score can be positive or negative, indicating whether the data point is above or below the mean. In anomaly detection, Z-scores that are far away from zero (typically, an absolute value greater than 2 or 3) indicate anomalies.

In [ ]:
import numpy as np

def calculate_zscore(df):
  # Calculate the average of the temperature readings
  mean = df['value'].mean()

  # Calculate the deviation from the average
  deviation = df['value'] - mean

  # Calculate the standard deviation of the deviation
  std_deviation = deviation.std() # same as we would calculate df['value'].std()

  # Calculate the Z-score of each reading
  z_score = deviation / std_deviation # z_score = (value - mean) / std

  # Calculate the threshold for anomaly detection
  # equivalent to saying: threshold = 3 * std_deviation
  threshold = 3

  # Detect anomalies based on the Z-score
  # equivalent to: anomalies = df[deviation.abs() > threshold * std_deviation]
  anomalies = df[np.abs(z_score) > threshold]
  return [anomalies,mean,threshold*std_deviation]
In [ ]:
anomalies,mean,threshold = calculate_zscore(df)
plot(df,anomalies,mean,threshold)
[Figure: Machine Temperature Anomaly Detection (Z-score method)]

Percentile Analysis¶

Percentile analysis flags readings that fall outside chosen lower and upper percentiles of the data. Here we keep the central 99.7% of values and treat everything outside that band as anomalous.

In [ ]:
def percentile_analysis(df):
  # calculate percentile limits: keep the central 99.7% of values, flag the outer 0.3%
  percent_limit = 0.3
  upper_threshold = df['value'].quantile((100-percent_limit/2)/100)
  lower_threshold = df['value'].quantile(percent_limit/2/100)

  # Detect anomalies based on percentile limits
  anomalies_upper = df[df['value'] > upper_threshold]
  anomalies_lower = df[df['value'] < lower_threshold]

  # Plot the temperature readings and the anomalies
  plt.subplots(figsize=(14, 10))
  plt.plot(df['timestamp'], df['value'], color='blue', label='Temperature Readings')
  plt.scatter(anomalies_upper['timestamp'], anomalies_upper['value'], color='cyan', label='Upper anomalies')
  plt.scatter(anomalies_lower['timestamp'], anomalies_lower['value'], color='red', label='Lower anomalies')
  plt.fill_between(df['timestamp'], lower_threshold, upper_threshold, color='gray', alpha=0.2, label='Threshold')
  plt.legend()
  plt.title('Machine Temperature Anomaly Detection - Percentiles')
  plt.xlabel('Date')
  plt.ylabel('Temperature (Celsius)')
  plt.grid()
  plt.show()
In [ ]:
percentile_analysis(df)
[Figure: Machine Temperature Anomaly Detection - Percentiles]

Classical Machine Learning Algorithms¶

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/numenta/NAB/master/data/realKnownCause/machine_temperature_system_failure.csv')

# Convert the timestamp column to a datetime object
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Set the timestamp column as the index and convert to a series
series = df.set_index('timestamp')['value'].squeeze()

Isolation Forest¶

Isolation Forest is an unsupervised machine learning algorithm. It uses an ensemble of decision trees to isolate anomalous points in the data.

Here's how the algorithm works:

  1. Randomly select a feature and a split point in the feature space.
  2. Partition the data into two groups using the feature and split point.
  3. Repeat steps 1 and 2 recursively on the resulting partitions until each point is isolated in its own partition.
  4. The number of partitions it takes to isolate a point is a measure of how anomalous that point is.

Points that can be isolated with fewer partitions are considered more anomalous.

In [ ]:
import pandas as pd
from sklearn.ensemble import IsolationForest

def detect_anomalies_with_isolation_forest(series):
    # Convert the series to a 2D NumPy array
    data = series.values.reshape(-1, 1)

    # Create an instance of IsolationForest class
    model = IsolationForest(n_estimators=100, contamination=0.01, random_state=45)

    # Fit model to the data
    model.fit(data)
    # predict anomalies
    anomalies = model.predict(data)

    # Convert predictions to series
    anomalies_series = pd.Series(anomalies, index=series.index)
    return anomalies_series

Isolation Forest parameters (an illustrative configuration follows this list)

  • n_estimators: This is the number of decision trees that will be used in the Isolation Forest algorithm. Increasing this value will generally lead to better performance, but will also increase the computational complexity and runtime of the algorithm.

  • max_samples: This is the number of samples that will be used to build each decision tree in the Isolation Forest algorithm. Increasing this value will generally lead to better performance, but may also increase the risk of overfitting.

  • contamination: This is the fraction of the input data that is assumed to be anomalous. This parameter is used to determine the decision threshold for classifying a data point as an anomaly. A lower value of contamination will result in a higher decision threshold, meaning that fewer data points will be classified as anomalies. A higher value of contamination will result in a lower decision threshold, meaning that more data points will be classified as anomalies.

  • max_features: This is the maximum number of features that will be used to build each decision tree in the Isolation Forest algorithm. This parameter is used to control the diversity of the decision trees. A higher value of max_features will result in more diverse decision trees, but may also increase the computational complexity and runtime of the algorithm.

  • bootstrap: This parameter controls whether or not the training data will be bootstrapped when building each decision tree in the Isolation Forest algorithm. Bootstrapping can help to improve the diversity of the decision trees and reduce the risk of overfitting.

  • random_state: This parameter is used to set the random seed for the Isolation Forest algorithm. Setting this parameter to a fixed value will ensure that the algorithm produces consistent results across multiple runs, which can be useful for debugging and reproducibility.
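
To make these parameters concrete, here is an illustrative configuration; the specific values below are arbitrary examples rather than recommendations, and in this notebook only n_estimators, contamination, and random_state are actually set.

In [ ]:
from sklearn.ensemble import IsolationForest

# Illustrative configuration - values are examples, not recommendations
model_example = IsolationForest(
    n_estimators=200,    # more trees: more stable scores, longer runtime
    max_samples=256,     # sub-sample size used to build each tree
    contamination=0.01,  # assume about 1% of the points are anomalous
    max_features=1.0,    # fraction of features used per tree (we only have one feature)
    bootstrap=False,     # sample without replacement
    random_state=42      # fixed seed for reproducible results
)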

In [ ]:
# Detect anomalies using the Isolation Forest algorithm
anomalies = detect_anomalies_with_isolation_forest(series)

# Plot the original series and the detected anomalies
plt.subplots(figsize=(14, 10))
plt.plot(df['timestamp'], df['value'], color='blue', label='Temperature Readings')
plt.scatter(anomalies[anomalies==-1].index, series[anomalies==-1].values, color='red', label='Anomalies')
plt.legend()
plt.title('Machine Temperature Anomaly Detection - Isolation Forest ')
plt.xlabel('Date')
plt.ylabel('Temperature (Celsius)')
plt.grid()
plt.show()
[Figure: Machine Temperature Anomaly Detection - Isolation Forest]

Local Outlier Factor¶

The Local Outlier Factor (LOF) algorithm is a density-based anomaly detection method that is designed to detect local outliers—that is, anomalies that may not be outliers when compared to the entire dataset but are outliers within their local neighborhood. LOF works well for datasets where the density of the data points is not uniform.

Algorithm:

  1. k-Nearest Neighbors: The algorithm starts by calculating the k-nearest neighbors for each data point. The distance can be calculated using various methods such as Euclidean distance, Manhattan distance, etc.

  2. Local Reachability Density (LRD): The local reachability density (LRD) of a data point is a measure of how densely the point is surrounded by its neighbors. LRD is computed as the inverse of the average reachability distance of the point from its k-nearest neighbors. The reachability distance of point A from a neighbor B is defined as:

\begin{equation} \text{reachability-distance}_k(A,B) = \max\big(k\text{-distance}(B),\; d(A,B)\big) \end{equation}

where $k\text{-distance}(B)$ is the distance from B to its $k^{th}$ nearest neighbor and $d(A,B)$ is the distance between A and B.

  3. LOF Calculation: The LOF of a data point $A$ is the average ratio of the LRDs of $A$'s k-nearest neighbors to $A$'s own LRD: \begin{equation} LOF_k(A) = \frac{1}{k} \sum_{B \in neighbors\;of\;A} \frac{LRD(B)}{LRD(A)} \end{equation}

If LOF(A) is approximately 1, then A is roughly as dense as its neighbors.

If LOF(A) is much less than 1, then A is denser than its neighbors (not an outlier).

If LOF(A) is much greater than 1, A is much less dense than its neighbors (an outlier).
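
This interpretation can be checked directly with scikit-learn, which exposes the negated LOF scores through the negative_outlier_factor_ attribute after fitting. The toy example below (an illustrative addition, not part of the original notebook) uses a small 1-D dataset with one obvious outlier.

In [ ]:
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

# Toy data: a tight cluster around 100 plus one obvious outlier
X = np.array([100.1, 99.8, 100.3, 99.9, 100.0, 100.2, 99.7, 150.0]).reshape(-1, 1)

lof = LocalOutlierFactor(n_neighbors=3)
labels = lof.fit_predict(X)  # -1 marks outliers, 1 marks inliers

# negative_outlier_factor_ is -LOF: values near -1 are inliers,
# strongly negative values (LOF >> 1) are outliers
for value, label, score in zip(X.ravel(), labels, lof.negative_outlier_factor_):
    print(f"value={value:6.1f}  label={int(label):2d}  LOF={-score:.2f}")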

In [ ]:
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

def detect_anomalies_with_local_outlier(series):
    lof = LocalOutlierFactor(n_neighbors=40, contamination=0.01)
    X = series.values.reshape(-1,1)
    y_pred = lof.fit_predict(X)
    anomalies = X[y_pred==-1]
    return pd.Series(anomalies.flatten(), index=series.index[y_pred==-1])
In [ ]:
# Detect anomalies using the Local Outlier Factor algorithm
anomalies = detect_anomalies_with_local_outlier(series)

# Plot the original series and the detected anomalies
plt.subplots(figsize=(14, 10))
plt.plot(df['timestamp'], df['value'], color='blue', label='Temperature Readings')
plt.scatter(anomalies.index, anomalies.values, color='red', label='Anomalies')
plt.legend()
plt.title('Machine Temperature Anomaly Detection - Local Outlier Factor')
plt.xlabel('Date')
plt.ylabel('Temperature (Celsius)')
plt.grid()
plt.show()
[Figure: Machine Temperature Anomaly Detection - Local Outlier Factor]

Parameters used in the Local Outlier Factor (LOF) algorithm:

  • n_neighbors: The number of neighbors used to calculate the local density of each data point. Increasing this value will make the algorithm less sensitive to isolated data points, but may also result in a loss of sensitivity to local outliers. A good rule of thumb is to set this parameter to the smallest value that still captures the local structure of the data.

  • contamination: The expected proportion of outliers in the data. This parameter sets the decision threshold on the LOF scores: the points with the most extreme scores, up to this fraction of the data, are labeled as outliers. A contamination of 0.1 assumes that 10% of the data points are outliers (recent versions of scikit-learn default to 'auto' instead); you should adjust this parameter based on the specific characteristics of your data.

In general, you should first set the n_neighbors parameter based on the local structure of your data, and then adjust the contamination parameter to achieve a desired level of sensitivity to outliers. However, it's important to note that the LOF algorithm is not always guaranteed to work well on all types of data, and you may need to experiment with different parameter settings and outlier detection methods to find the best approach for your specific use case.
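
One way to make this tuning concrete (a sketch that assumes the series loaded earlier in this notebook) is to sweep a few values of n_neighbors with the default contamination='auto' threshold and observe how many points get flagged; the contamination parameter can then be adjusted once a suitable neighborhood size is found.

In [ ]:
from sklearn.neighbors import LocalOutlierFactor

X = series.values.reshape(-1, 1)

# With contamination='auto' the decision threshold is fixed, so the number
# of flagged points reflects the choice of n_neighbors alone.
for n_neighbors in (10, 20, 40, 80):
    lof = LocalOutlierFactor(n_neighbors=n_neighbors)
    n_outliers = (lof.fit_predict(X) == -1).sum()
    print(f"n_neighbors={n_neighbors:3d}  flagged outliers={n_outliers}")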

Autoencoder algorithm¶

Autoencoders are a type of neural network architecture often used for dimensionality reduction, denoising, and feature learning. In the context of anomaly detection, autoencoders can be trained to capture the common or 'normal' characteristics of the data. Then, the model can be used to detect anomalies by identifying data points that the autoencoder struggles to reconstruct accurately.

Architecture of an Autoencoder:

  1. Encoder: This part of the network compresses the input into a latent-space representation. It encodes the input as an internal fixed-size representation in reduced dimensionality. The encoder phase maps the input data $x$ to a latent variable $z$, through a function $f$ \begin{equation} z = f(x) \end{equation}

  2. Decoder: This part of the network reconstructs the input data from the internal representation. It maps the encoded data back to the original data space. The decoder phase reconstructs the input data $x$ from the latent representation $z$, through a function: \begin{equation} x^{'} = g(z) \end{equation}

How Autoencoders are Used for Anomaly Detection:

  1. Training: During the training phase, the autoencoder learns to encode the 'normal' training data into lower dimensions and then decode it back to the original data with minimal loss. The model is trained to minimize a loss function, often the Mean Squared Error (MSE), which measures the difference between the input $x$ and the reconstructed output $x^{'}$: \begin{equation} MSE = \frac{1}{n}\sum_{i=1}^{n}(x_i-x_i^{'})^2 \end{equation}

  2. Evaluation: During the evaluation phase, the trained autoencoder takes in new data and attempts to reconstruct it. For 'normal' data points, the reconstruction error should be low, while for anomalous data points, the error is expected to be high.

  3. Thresholding: A threshold is established based on the reconstruction error of the training (normal) data. Data points that have a reconstruction error higher than this threshold are flagged as anomalies (see the sketch after this list).
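
The detect_anomalies function defined later in this notebook takes a shortcut and thresholds at the 95th percentile of the test-set reconstruction error. A sketch closer to the thresholding step above (assuming a trained model and the same sliding-window preparation used later in this notebook) would derive the threshold from the training reconstruction errors, for example as their mean plus three standard deviations.

In [ ]:
import numpy as np

def training_error_threshold(model, train_series, window_size=20, n_std=3):
    """Derive an anomaly threshold from the reconstruction error on training data."""
    # Build the same sliding windows that the autoencoder was trained on
    X_train = np.array([train_series[i:i + window_size]
                        for i in range(len(train_series) - window_size)])

    # Reconstruction error (MSE per window) on the 'normal' training data
    X_pred = model.predict(X_train)
    train_mse = np.mean(np.power(X_train - X_pred, 2), axis=1)

    # Anything above mean + n_std * std of the training error counts as anomalous
    return train_mse.mean() + n_std * train_mse.std()

The returned value could then be used in place of the percentile-based threshold inside detect_anomalies.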

Discussion

To use an autoencoder for anomaly detection, we need training data that contains no anomalies; this data is used to train the network. However, since we do not know which portion of the data is anomaly-free, we will assume that a certain part of the data contains no anomalies and train the network on that part.

In [ ]:
train_series = series.loc['2013-12-18':'2014-01-26']
test_series = series # Run on all data

min_val = series.min()
max_val = series.max()

# Plot the original series and the detected anomalies
plt.subplots(figsize=(14, 10))
plt.plot(df['timestamp'], df['value'], color='blue', label='Temperature Readings')
plt.plot(train_series.index, train_series.values, color='green', label='training data')
plt.fill_between(train_series.index, min_val, max_val, color='gray', alpha=0.2, label='Training region')
plt.legend()
plt.title('Partitioning data to training and test data')
plt.xlabel('Date')
plt.ylabel('Temperature (Celsius)')
plt.grid()
plt.show()
[Figure: Partitioning data to training and test data]
In [ ]:
from tensorflow import keras

def build_autoencoder(window_size=20, latent_dim=3):
    """
    Build a simple autoencoder
    """
    # Define the autoencoder architecture
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(window_size,)),
        keras.layers.Dense(latent_dim, activation='relu'),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(window_size, activation='linear')
    ])
    return model
In [ ]:
import numpy as np
from tensorflow import keras

def train_autoencoder(train_series, model, window_size=20, epochs=100):
    """
    Train the  autoencoder
    """
    # Prepare the input data
    X = []
    for i in range(len(train_series) - window_size):
        X.append(train_series[i:i+window_size])
    X = np.array(X)

    # Train the autoencoder
    model.compile(optimizer='adam', loss='mse')
    model.fit(X, X, epochs=epochs, verbose=0)
    return model
In [ ]:
import numpy as np
import pandas as pd
from tensorflow import keras

def detect_anomalies(test_series, model, window_size=20):
    # Prepare the input data
    X = []
    for i in range(len(test_series) - window_size):
        X.append(test_series[i:i+window_size])
    X = np.array(X)

    # Use the trained autoencoder to detect anomalies
    X_pred = model.predict(X)
    mse = np.mean(np.power(X - X_pred, 2), axis=1)
    threshold = np.percentile(mse, 95)
    anomalies = test_series.iloc[window_size:][mse >= threshold]

    return anomalies
In [ ]:
# Detect anomalies using the autoencoder

# Select training data
train_series = series.loc['2013-12-18':'2014-01-26']
test_series = series # Run on all data

min_val = series.min()
max_val = series.max()

# Set autoencoder parameters
window_size = 150
latent_dim = 3
epochs = 100

# Build a simple autoencoder
model = build_autoencoder(window_size, latent_dim)

# Train the  autoencoder
model = train_autoencoder(train_series, model, window_size, epochs)

# Detect anomalies using the trained autoencoder
anomalies = detect_anomalies(test_series, model, window_size)

# Plot the original series and the detected anomalies
plt.subplots(figsize=(14, 10))
plt.plot(df['timestamp'], df['value'], color='blue', label='Temperature Readings')
plt.scatter(anomalies.index, anomalies.values, color='red', label='Anomalies')
plt.fill_between(train_series.index, min_val, max_val, color='gray', alpha=0.2, label='Training region')
plt.legend()
plt.title('Machine Temperature Anomaly Detection - Autoencoder')
plt.xlabel('Date')
plt.ylabel('Temperature (Celsius)')
plt.grid()
plt.show()
705/705 [==============================] - 1s 1ms/step
[Figure: Machine Temperature Anomaly Detection - Autoencoder]

Time Series Specific Methods¶

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/numenta/NAB/master/data/realKnownCause/machine_temperature_system_failure.csv')

# Convert the timestamp column to a datetime object
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Set the timestamp column as the index and convert to a series
series = df.set_index('timestamp')['value'].squeeze()

ARIMA¶

ARIMA (AutoRegressive Integrated Moving Average)

ARIMA is a statistical model used for time series forecasting. The ARIMA model captures different aspects of the time series data, such as trend, autocorrelation, and noise, by combining three components: AutoRegressive (AR), Integrated (I), and Moving Average (MA). (Seasonality is handled by its seasonal extension, SARIMA.)

Components

  1. AR (AutoRegressive): Captures the relationship between an observation and its lagged observations (previous time steps). \begin{equation} X_t = c + \phi_1 X_{t-1} + \phi_2 X_{t-2} + \ldots + \phi_p X_{t-p} + e_t \end{equation}
  2. I (Integrated): Differencing the series to make it stationary. \begin{equation} \Delta X_t = X_t - X_{t-1} \end{equation}
  3. MA (Moving Average): Models the relationship between an observation and the residual errors from previous observations. \begin{equation} X_t = \mu + e_t + \theta_1 e_{t-1} + \theta_2 e_{t-2} + \ldots + \theta_q e_{t-q} \end{equation}

How to Use ARIMA for Anomaly Detection

  1. Data Preprocessing: Make sure your time series data is stationary. You may need to perform transformations like differencing or taking the logarithm of the series.
  2. Model Fitting: Fit an ARIMA model to your time series data. You'll need to determine the order of each component (AR, I, MA), denoted as $(p,d,q)$.
  3. Residual Analysis: After fitting, calculate the residuals $(e_t)$, which are the differences between the observed values and the values predicted by the model: $e_t = X_t-\hat{X}_t$
  4. Threshold Setting: Calculate the standard deviation $(\sigma)$ of the residuals. Anomalies can then be identified if the absolute value of a residual is higher than a certain threshold, often defined as $\sigma \times k$, where $k$ is a constant.

  5. Anomaly Detection: Use the trained ARIMA model to forecast future points in the time series. Calculate the residuals for these points and flag those with residuals exceeding the threshold as anomalies.

Advantages:

  • Good for capturing linear relationships in the time series.
  • Does not require large datasets for training.

Disadvantages:

  • Assumes linear relationships and might not perform well for more complex or non-linear data.
  • The choice of parameters $(p,d,q)$ often requires domain knowledge or grid search, which can be computationally intensive.

ARIMA models are widely used for anomaly detection in various domains like finance for detecting fraudulent transactions, in manufacturing for quality control, and in IT for system health monitoring.

Stationarity

In time series analysis, a stationary series is one whose statistical properties like mean, variance, and autocorrelation are all constant over time. In simpler terms, the series does not exhibit trends or seasonality. Stationarity is an important assumption for many time series models, including ARIMA (AutoRegressive Integrated Moving Average).

The ARIMA model assumes that the underlying data is stationary. If the data is not stationary, the model's forecasts can be inaccurate or misleading. Non-stationary data often contain trends or seasonal patterns. For example, sales data could increase over time due to company growth, or temperature data could exhibit seasonal patterns (e.g., warmer in summer and colder in winter). If you attempt to model such data without making them stationary, the model will not capture the underlying patterns effectively.

The "Integrated" component in ARIMA is designed to help with this issue. By differencing the data (subtracting the current value from the previous value), the model aims to make the series stationary if it isn't already. The "order of differencing" (the "I" in ARIMA) indicates how many times differencing is performed to make the series stationary. However, differencing is not always sufficient to make a series stationary, and additional transformations may be needed.

In summary, ensuring that the data are stationary is critical for the performance of ARIMA models. This is why one of the initial steps in applying ARIMA is to check for stationarity and to apply differencing or other transformations to stabilize the mean and variance across the time series.

Check for Stationarity

  1. Plotting the data to visualize trends or seasonality.
  2. Performing statistical tests like the Augmented Dickey-Fuller test.
  3. Examining the autocorrelation and partial autocorrelation plots.
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Step 1: Plot the data
plt.figure(figsize=(12, 6))
plt.plot(series.index, series.values)
plt.title('Time Series Data')
plt.xlabel('Time')
plt.ylabel('Value')
plt.show()

# Step 2: Perform Augmented Dickey-Fuller test
result = adfuller(series)
print('ADF Statistic:', result[0])
print('p-value:', result[1])
print('Critical Values:', result[4])
if result[1] <= 0.05:
    print('The series is stationary.')
else:
    print('The series is not stationary.')

# Step 3: Plot ACF and PACF
plot_acf(series, lags=20)
plt.title('Autocorrelation Function')
plt.show()

plot_pacf(series, lags=20)
plt.title('Partial Autocorrelation Function')
plt.show()
[Figure: Time Series Data]
ADF Statistic: -7.3690895644913414
p-value: 9.071345315518266e-11
Critical Values: {'1%': -3.4306384378792365, '5%': -2.861667480233377, '10%': -2.566837854101014}
The series is stationary.
[Figure: Autocorrelation Function]
[Figure: Partial Autocorrelation Function]

Explanation

The Augmented Dickey-Fuller (ADF) test is commonly used to test for stationarity in a time series. The ADF statistic is -7.37, which is well below the critical values at the 1%, 5%, and 10% levels. The p-value is also extremely low (9.07e-11), well below any conventional significance level. These findings strongly reject the null hypothesis of the ADF test, which assumes that the series has a unit root and is therefore non-stationary. Based on this test, the data are very likely stationary.

The ACF shows values that stay close to 1, which typically indicates non-stationarity, yet the ADF test says that the series is stationary. This might seem like a contradiction. There can be multiple explanations for this apparent discrepancy:

The ADF test is a formal statistical test, whereas reading the ACF plot is only a visual heuristic, so the test result carries more weight here. The series could therefore actually be stationary despite the ACF pattern.

It could be that the ACF is showing high autocorrelation due to some other form of dependence that doesn't violate stationarity. There may be other nuances or complexities in the data that aren't fully captured by these simple diagnostic tools.

The quick decay in the PACF suggests that an $AR(1)$ model could be a good fit for this time series data.

To summarize:

The series is likely stationary according to the ADF test. Despite the ACF pattern, which usually indicates non-stationarity, the strong ADF result suggests that the data is actually stationary. The PACF suggests that an $AR(1)$ model may be a good fit for the data.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA

min_val = series.min()
max_val = series.max()

# Split the data into training and testing sets
train_series = series.loc['2013-12-18':'2014-01-26']
test_series = series.loc['2014-01-22':] # some overlapping

# ---------------------- CHATGPT ---------------------- :
# Fit an ARIMA model to the training data
#model = ARIMA(train_series.values, order=(3,1,1))
model = ARIMA(train_series.values, order=(1,0,0))
model_fit = model.fit()

# Make predictions on the testing data
# predictions = model_fit.forecast(steps=len(test_series))[0] # ORIGINAL - returns only the first forecast value
predictions = model_fit.forecast(steps=len(test_series))

# Calculate the residuals between the predicted and actual values
residuals = test_series.values - predictions

# Calculate the mean and standard deviation of the residuals
residual_mean = np.mean(residuals)
residual_std = np.std(residuals)

# Determine the threshold for anomaly detection
threshold = residual_mean + 2.5 * residual_std

# Identify anomalies in the testing data
anomalies = test_series[abs(residuals) > threshold]
# ---------------------- CHATGPT ----------------------

# Plot the original series and the detected anomalies
plt.subplots(figsize=(14, 10))
plt.plot(df['timestamp'], df['value'], color='blue', label='Temperature Readings')
plt.scatter(anomalies.index, anomalies.values, color='red', label='Anomalies')
plt.fill_between(train_series.index, min_val, max_val, color='gray', alpha=0.2, label='Training region')
plt.fill_between(test_series.index, min_val, max_val, color='red', alpha=0.15, label='Test region')
#plt.plot(test_series.index, predictions, color='green', label='Predictions')
plt.legend()
plt.title('Machine Temperature Anomaly Detection - ARIMA')
plt.xlabel('Date')
plt.ylabel('Temperature (Celsius)')
plt.grid()
plt.show()
[Figure: Machine Temperature Anomaly Detection - ARIMA]

Hint: the line predictions = model_fit.forecast(steps=len(test_series))[0] returns only the first forecast value. Comparing that single value to every point in the test data is similar to comparing the data against a constant such as its mean. Generating the whole forecast series with predictions = model_fit.forecast(steps=len(test_series)) does not help much either: such a simple linear model cannot forecast accurately over such a long horizon. This would require further modifications, for example the one-step-ahead (rolling) forecasting sketched below.
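
One possible modification (a sketch, not part of the original notebook) is one-step-ahead rolling forecasting: forecast a single point, then append the actual observation to the fitted results so that every prediction is conditioned on the most recent data. With statsmodels this can be done with the results object's append method and refit=False, which reuses the already estimated parameters. Note that the loop below is slow for long series and is meant only to illustrate the idea.

In [ ]:
import numpy as np

# Rolling one-step-ahead forecasts over the test period
rolling_fit = model_fit
one_step_preds = []
for value in test_series.values:
    # Forecast the next point, then feed in the actual observation
    one_step_preds.append(rolling_fit.forecast(steps=1)[0])
    rolling_fit = rolling_fit.append([value], refit=False)

one_step_preds = np.array(one_step_preds)
residuals = test_series.values - one_step_preds

# Same thresholding logic as above, now on one-step-ahead residuals
threshold = residuals.mean() + 2.5 * residuals.std()
anomalies = test_series[np.abs(residuals) > threshold]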