import numpy as np
from datetime import datetime
from pandas import read_table
import matplotlib.pyplot as plt
import pandas as pd
str_path = 'your data path'
cols = ['j', 'u', 'v', 'temp', 'sal', 'y', 'mn', 'd', 'h', 'mi']
df = read_table(str_path+'spikey_v.dat' , delim_whitespace=True, names=cols)
df.index = [datetime(*x) for x in zip(df['y'], df['mn'], df['d'], df['h'], df['mi'])]
df = df.drop(['y', 'mn', 'd', 'h', 'mi'], axis=1)
data = np.full(len(df), np.nan)
for i in range(len(df)):
data[i] = df['u'][i]
plt.plot(data)
plt.grid()
Outlier_parameters = {
"AUTO": 1,
}
df = pd.concat([pd.DataFrame({"ID":list(range(len(data)))}),pd.DataFrame(data)],axis=1)
dfOutlierResults, dfStats, dfMetri = OutlierDetectionForTimeSeries(df,Outlier_parameters,cc)
dfOutlierResults
dfStats
outlier_result_plot(dfOutlierResults)
outlier_plot(dfOutlierResults)
From the statistic table, we can see that the final smoothing method is median filter, as the time series is quite smooth. From the above results, we find that PAL miss detecting an outlier. This is because there are two big outliers here and the standard deviation becomes large. From the plots of residual and outlier score, we can find four outliers very clearly. We can adjust the threshold or choose other methods in step 2 to find all outliers.
str_path = 'your data path'
df = pd.read_csv(str_path+'daily_csv_no_missing_value.csv')
num = len(df)
data = np.full(num,np.nan)
for i in range(num):
data[i] = df['Price'][i]
plt.plot(data)
plt.grid()
Outlier_parameters = {
"AUTO": 1,
}
df = pd.concat([pd.DataFrame({"ID":list(range(len(data)))}),pd.DataFrame(data)],axis=1)
dfOutlierResults, dfStats, dfMetri = OutlierDetectionForTimeSeries(df,Outlier_parameters,cc)
dfOutlierResults
dfStats
outlier_result_plot(dfOutlierResults)
outlier_plot(dfOutlierResults)
import math
random_seed = 3
np.random.seed(random_seed)
cols = 200 # length of time series
cycle = 40
outlier_idx = [30, 45, 73, 126, 159, 173]
timestamp = np.full(cols,np.nan,dtype = int)
for i in range(cols):
timestamp[i] = i
seasonal = np.full(cols,np.nan,dtype = float)
for i in range(cols):
seasonal[i] = math.sin(2*math.pi/cycle*timestamp[i])
const = np.full(cols,2,dtype = float)
noise = np.full(cols,0,dtype = float)
for i in range(cols):
noise[i] = 0.2*(np.random.rand()-0.5)
trend = 0.01*timestamp
outlier = np.full(cols,0,dtype = float)
for i in outlier_idx:
outlier[i] = 4*(np.random.rand()-0.5)
data = seasonal + const + noise + trend + outlier
plt.plot(data)
plt.grid()
Outlier_parameters = {
"AUTO": 1,
}
df = pd.concat([pd.DataFrame({"ID":list(range(len(data)))}),pd.DataFrame(data)],axis=1)
dfOutlierResults, dfStats, dfMetri = OutlierDetectionForTimeSeries(df,Outlier_parameters,cc)
dfOutlierResults
dfStats
outlier_result_plot(dfOutlierResults)
outlier_plot(dfOutlierResults)
str_path = 'your data path'
df = pd.read_csv(str_path+'ice_cream_interest.csv')
data = np.full(len(df),np.nan,dtype = float)
for i in range(len(df)):
data[i] = df['interest'][i]
plt.plot(data)
plt.grid()
Outlier_parameters = {
"AUTO": 1,
}
df = pd.concat([pd.DataFrame({"ID":list(range(len(data)))}),pd.DataFrame(data)],axis=1)
dfOutlierResults, dfStats, dfMetri = OutlierDetectionForTimeSeries(df,Outlier_parameters,cc)
dfOutlierResults
dfStats
outlier_result_plot(dfOutlierResults)
outlier_plot(dfOutlierResults)
import hana_ml
from hana_ml import dataframe
conn = dataframe.ConnectionContext('host', 'port', 'username', 'password')
def createEmptyTable(table_name, proto, cc):
with cc.connection.cursor() as cur:
try:
joint = []
for key in proto:
joint.append(" ".join(['"{:s}"'.format(key), proto[key]]))
cur.execute('CREATE COLUMN TABLE %s (%s);' %
(table_name, ",".join(joint)))
except:
print(
f"\"CREATE TABLE {table_name}\" was unsuccessful. Maybe the table has existed.")
def dropTable(table_name, cc):
with cc.connection.cursor() as cur:
try:
cur.execute(f"DROP TABLE {table_name}")
except:
print(f"\"DROP TABLE {table_name}\" was unsuccessful. Maybe the table does not exist yet.")
def createTableFromDataFrame(df, table_name, cc):
dropTable(table_name, cc)
dt_ml = dataframe.create_dataframe_from_pandas(cc, df, table_name=table_name, table_structure={"MODEL_CONTENT":"NCLOB"})
# dt_ml = dataframe.create_dataframe_from_pandas(cc, df, table_name=table_name, table_structure={"COL1":"CLOB"})
return dt_ml
def OutlierDetectionForTimeSeries(df, parameters, cc,
data_table='ZPD_PAL_DATA_TBL',
parameter_table='ZPD_PAL_PARAMETERS_TBL',
result_table='ZPD_PAL_RESULT_TBL',
stats_table='ZPD_PAL_STATS_TBL',
metri_table='ZPD_PAL_METRI_TBL'):
# Input table
createTableFromDataFrame(df, data_table, cc)
# Result table
dropTable(result_table, cc)
createEmptyTable(result_table, {
"TIMESTAMP": "INTEGER","RAW_DATA":"DOUBLE","RESIDUAL":"DOUBLE","OUTLIER_SCORE":"DOUBLE","IS_OUTLIER":"INTEGER"}, cc)
# Metri table
dropTable(metri_table, cc)
createEmptyTable(metri_table, {
"STAT_NAME": "NVARCHAR(1000)","VALUE":"DOUBLE"}, cc)
# Stats table
dropTable(stats_table, cc)
createEmptyTable(stats_table, {
"STAT_NAME": "NVARCHAR(1000)", "STAT_VALUE": "NVARCHAR(1000)"}, cc)
# Parameter table
dropTable(parameter_table, cc)
createEmptyTable(parameter_table, {"PARAM_NAME": "nvarchar(256)", "INT_VALUE": "integer",
"DOUBLE_VALUE": "double", "STRING_VALUE": "nvarchar(1000)"}, cc)
if parameters:
with cc.connection.cursor() as cur:
for parName, parValue in parameters.items():
if isinstance(parValue, str):
parValue = f"'{parValue}'"
parametersSQL = f"{parValue if isinstance(parValue,int) else 'NULL'}, {parValue if isinstance(parValue,float) else 'NULL'}, { parValue if isinstance(parValue,str) else 'NULL'}"
cur.execute(
f"INSERT INTO {parameter_table} VALUES ('{parName}', {parametersSQL});")
elif isinstance(parValue,list):
for x in parValue:
if isinstance(x, str):
x = f"'{x}'"
parametersSQL = f"{x if isinstance(x,int) else 'NULL'}, {x if isinstance(x,float) else 'NULL'}, { x if isinstance(x,str) else 'NULL'}"
cur.execute(
f"INSERT INTO {parameter_table} VALUES ('{parName}', {parametersSQL});")
else:
parametersSQL = f"{parValue if isinstance(parValue,int) else 'NULL'}, {parValue if isinstance(parValue,float) else 'NULL'}, { parValue if isinstance(parValue,str) else 'NULL'}"
cur.execute(
f"INSERT INTO {parameter_table} VALUES ('{parName}', {parametersSQL});")
else:
print("No parameters given using default values.")
sql_str = f"\
do begin \
lt_data = select * from {data_table}; \
lt_control = select * from {parameter_table};\
CALL _SYS_AFL.PAL_OUTLIER_DETECTION_FOR_TIME_SERIES(:lt_data, :lt_control, lt_res, lt_stats, lt_metri); \
INSERT INTO {result_table} SELECT * FROM :lt_res; \
INSERT INTO {stats_table} SELECT * FROM :lt_stats;\
INSERT INTO {metri_table} SELECT * FROM :lt_metri; \
end;"
with cc.connection.cursor() as cur:
cur.execute(sql_str)
return cc.table(result_table).collect(), cc.table(stats_table).collect(), cc.table(metri_table).collect()
def outlier_result_plot(dResults):
dResults.sort_values(by = list(dResults)[0], inplace = True, ascending = True)
raw_data = np.array(dResults['RAW_DATA'])
residual = np.array(dResults['RESIDUAL'])
outlier_score = np.array(dResults['OUTLIER_SCORE'])
is_outlier = np.array(dResults['IS_OUTLIER'])
plt.figure(figsize = (24,4.5))
plt.subplot(1,4,1)
plt.plot(raw_data)
plt.grid()
plt.title('RAW_DATA')
plt.subplot(1,4,2)
plt.plot(residual)
plt.grid()
plt.title('RESIDUAL')
plt.subplot(1,4,3)
plt.plot(outlier_score)
plt.grid()
plt.title('OUTLIER_SCORE')
plt.subplot(1,4,4)
plt.plot(is_outlier)
plt.grid()
plt.title('IS_OUTLIER')
def outlier_plot(dResults):
dResults.sort_values(by = list(dResults)[0], inplace = True, ascending = True)
raw_data = np.array(dResults['RAW_DATA'])
is_outlier = np.array(dResults['IS_OUTLIER'])
outlier_idx = np.array([],dtype = int)
for i in range(len(is_outlier)):
if is_outlier[i] == 1:
outlier_idx = np.append(outlier_idx,i)
plt.plot(raw_data)
plt.scatter(outlier_idx,raw_data[outlier_idx],color = 'red')
plt.grid()
plt.title("series and outlier")
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
32 | |
14 | |
11 | |
10 | |
10 | |
9 | |
8 | |
8 | |
6 | |
6 |