Sorry for the late reply.
These are the three code snippets I wrote before; run them in order. I hope they will be useful to you! @huhui, @arunbaruah, @nagsubhadeep, @Magical66
1.
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 23 10:54:57 2019
@author: lidongxu1
"""
import re
import spacy
import json

def data_read(filepath):
    # Read the file and return a list of its lines, trailing newlines stripped
    datas = []
    with open(filepath, "r") as fp:
        for line in fp:
            datas.append(line.strip('\n'))
    return datas
def camel_to_snake(name):
    """
    Convert camelCase to snake_case, handling more advanced cases
    (this is not reversible anymore).
    Ref: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
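# Example: camel_to_snake('getHTTPResponseCode') -> 'get_http_response_code'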
def replace_all_blank(value):
    r"""
    Remove everything that is not a letter from value: punctuation,
    whitespace, newlines, underscores, and digits all become spaces.
    :param value: the string to clean
    :return: the cleaned string
    # https://juejin.im/post/5d50c132f265da03de3af40b
    # \W matches any character that is not a letter, digit, or underscore
    """
    result = re.sub(r'\W+', ' ', value).replace('_', ' ')
    result = re.sub(r'\d', ' ', result)
    return result
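# Example: replace_all_blank('Receiving block blk_-562 src:/10.251') keeps
# only the letters ('Receiving block blk src'); the stray spaces left behind
# are collapsed later by " ".join(temp.split())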
# https://github.com/explosion/spaCy
# https://github.com/hamelsmu/Seq2Seq_Tutorial/issues/1
nlp = spacy.load('en_core_web_sm')

def lemmatize_stop(text):
    """
    Tokenize with spaCy and drop stop words.
    https://stackoverflow.com/questions/45605946/how-to-do-text-pre-processing-using-spacy
    """
    document = nlp(text)
    # To lemmatize instead of keeping surface forms, use token.lemma_:
    # lemmas = [token.lemma_ for token in document if not token.is_stop]
    lemmas = [token.text for token in document if not token.is_stop]
    return lemmas
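# Example (stop-word lists vary slightly by spaCy version):
# lemmatize_stop('receiving block from src') -> ['receiving', 'block', 'src']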
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to serialize
    :param target_path: destination path of the JSON file
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    with open(target_path, 'w', encoding='utf-8') as file:
        file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))

data = data_read('template.txt')
result = {}
for i in range(len(data)):
    temp = data[i]
    temp = camel_to_snake(temp)
    temp = replace_all_blank(temp)
    temp = " ".join(temp.split())  # collapse repeated whitespace
    temp = lemmatize_stop(temp)
    result[i] = temp
print(result)
dump_2_json(result, 'eventid2template.json')
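# eventid2template.json maps each line number of template.txt to its list of
# cleaned words, e.g. (hypothetical): {"0": ["receive", "block", "src"], ...}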
# Save separately only the fastText word vectors that the templates actually use
template_set = set()
for key in result.keys():
    for word in result[key]:
        template_set.add(word)

import io
from tqdm import tqdm

# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))  # store a list, not a lazy map object
    return data
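# cc.en.300.vec is fastText's plain-text format: a "vocab_size dim" header
# line, then one word followed by its 300 floats per line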
fasttext = load_vectors('cc.en.300.vec')
template_fasttext_map = {}
for word in template_set:
    template_fasttext_map[word] = list(fasttext[word])
dump_2_json(template_fasttext_map, 'fasttext_map.json')
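Note that fasttext[word] raises a KeyError for any template word missing from cc.en.300.vec; if you hit this, you may want to fall back to a zero vector or skip the word.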
2.
import os
import json
import numpy as np
import pandas as pd
from collections import Counter
import math

def read_json(filename):
    with open(filename, 'r') as load_f:
        file_dict = json.load(load_f)
    return file_dict
eventid2template = read_json('eventid2template.json')
fasttext_map = read_json('fasttext_map.json')
print(eventid2template)

dataset = list()
with open('data/' + 'deepLog_hdfs_train.txt', 'r') as f:
    for line in f.readlines():
        # shift event ids from 1-indexed to 0-indexed
        line = tuple(map(lambda n: n - 1, map(int, line.strip().split())))
        dataset.append(line)
print(len(dataset))
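# deepLog_hdfs_train.txt is expected to hold one session per line as a
# space-separated sequence of integer event ids (1-indexed, hence the n - 1)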
idf_matrix = list()
for seq in dataset:
    for event in seq:
        idf_matrix.append(eventid2template[str(event)])
print(len(idf_matrix))

# Count word occurrences per template occurrence; idf_matrix stays a plain
# list because the templates have different lengths (np.array over ragged
# lists is deprecated in recent NumPy)
X_counts = []
for template_words in idf_matrix:
    word_counts = Counter(template_words)
    X_counts.append(word_counts)
print(X_counts[1000])

X_df = pd.DataFrame(X_counts)
X_df = X_df.fillna(0)
print(len(X_df))
print(X_df.head())
events = X_df.columns
print(events)
X = X_df.values
num_instance, num_event = X.shape
print('tf-idf here')
df_vec = np.sum(X > 0, axis=0)  # document frequency of each word
print(df_vec)
print('*' * 20)
print(num_instance)
# smooth idf as in sklearn: idf = ln((1 + n) / (1 + df)) + 1
idf_vec = np.log((num_instance + 1) / (df_vec + 1)) + 1
print(idf_vec)
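# For illustration: with num_instance = 1000 documents and a word appearing
# in 99 of them, idf = ln(1001 / 100) + 1 ≈ 3.30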
idf_matrix = X * np.tile(idf_vec, (num_instance, 1))
X_new = idf_matrix
print(X_new.shape)
print(X_new[1000])

word2idf = dict()
for i, j in zip(events, idf_vec):
    word2idf[i] = j
# smooth idf for out-of-vocabulary words, treating an OOV word as if it
# appeared in 29 documents (an arbitrary constant chosen here)
word2idf['oov'] = math.log((num_instance + 1) / (29 + 1)) + 1
print(word2idf)
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to serialize
    :param target_path: destination path of the JSON file
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    with open(target_path, 'w', encoding='utf-8') as file:
        file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))

dump_2_json(word2idf, 'word2idf.json')
3.
import json
import numpy as np
from collections import Counter

def read_json(filename):
    with open(filename, 'r') as load_f:
        file_dict = json.load(load_f)
    return file_dict

event2template = read_json('eventid2template.json')
fasttext = read_json('fasttext_map.json')
word2idf = read_json('word2idf.json')
event2semantic_vec = dict()
# Compute the TF of each word in every template, then build a TF-IDF-weighted
# sentence vector per template
for event in event2template.keys():
    template = event2template[event]
    tem_len = len(template)
    count = dict(Counter(template))
    for word in count.keys():
        TF = count[word] / tem_len  # term frequency within the template
        IDF = word2idf.get(word, word2idf['oov'])  # fall back to the OOV idf
        count[word] = TF * IDF
    # normalize the TF-IDF weights so they sum to 1
    value_sum = sum(count.values())
    for word in count.keys():
        count[word] = count[word] / value_sum
    semantic_vec = np.zeros(300)
    for word in count.keys():
        fasttext_weight = np.array(fasttext[word])
        semantic_vec += count[word] * fasttext_weight
    event2semantic_vec[event] = list(semantic_vec)
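# Each event's semantic vector is therefore the TF-IDF-weighted average of
# the fastText vectors of its template words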
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to serialize
    :param target_path: destination path of the JSON file
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    with open(target_path, 'w', encoding='utf-8') as file:
        file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))

dump_2_json(event2semantic_vec, 'event2semantic_vec_sameoov.json')
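For reference, a minimal sketch of loading the final output and looking up one event's vector (the key '0' is only an assumed example; keys are event ids stored as strings):

import json
import numpy as np

with open('event2semantic_vec_sameoov.json', 'r') as f:
    event2semantic_vec = json.load(f)

vec = np.array(event2semantic_vec['0'])  # hypothetical key
print(vec.shape)  # (300,)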
Originally posted by @donglee-afar in #3 (comment)