Sorry for the late reply.
These are the three code snippets I wrote before; run them in order. I hope they will be useful to you! @huhui, @arunbaruah, @nagsubhadeep, @Magical66
1.
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 23 10:54:57 2019
@author: lidongxu1
"""
import re
import spacy
import json

def data_read(filepath):
    # Read the file and return a list of its lines, trailing newlines stripped
    datas = []
    with open(filepath, "r") as fp:
        for line in fp:
            datas.append(line.strip('\n'))
    return datas
def camel_to_snake(name):
    """
    Convert camelCase to snake_case, handling more advanced cases
    (this is not reversible anymore).
    Ref: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
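# Example: camel_to_snake('getHTTPResponseCode') -> 'get_http_response_code'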
def replace_all_blank(value):
    r"""
    Remove everything that is not a letter from value: punctuation,
    whitespace, newlines, underscores, and digits all become spaces.
    :param value: the string to clean
    :return: the cleaned string
    # https://juejin.im/post/5d50c132f265da03de3af40b
    # \W matches any character that is not a letter, digit, or underscore
    """
    result = re.sub(r'\W+', ' ', value).replace('_', ' ')
    result = re.sub(r'\d', ' ', result)
    return result
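# Example: replace_all_blank('Receiving block blk_-562 src:/10.251') keeps
# only the letters ('Receiving block blk src'); the stray spaces left behind
# are collapsed later by " ".join(temp.split())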
# https://github.com/explosion/spaCy
# https://github.com/hamelsmu/Seq2Seq_Tutorial/issues/1
nlp = spacy.load('en_core_web_sm')

def lemmatize_stop(text):
    """
    Tokenize with spaCy and drop stop words.
    https://stackoverflow.com/questions/45605946/how-to-do-text-pre-processing-using-spacy
    """
    document = nlp(text)
    # To lemmatize instead of keeping surface forms, use token.lemma_:
    # lemmas = [token.lemma_ for token in document if not token.is_stop]
    lemmas = [token.text for token in document if not token.is_stop]
    return lemmas
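# Example (stop-word lists vary slightly by spaCy version):
# lemmatize_stop('receiving block from src') -> ['receiving', 'block', 'src']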
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to serialize
    :param target_path: destination path of the JSON file
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    with open(target_path, 'w', encoding='utf-8') as file:
        file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))

data = data_read('template.txt')
result = {}
for i in range(len(data)):
    temp = data[i]
    temp = camel_to_snake(temp)
    temp = replace_all_blank(temp)
    temp = " ".join(temp.split())  # collapse repeated whitespace
    temp = lemmatize_stop(temp)
    result[i] = temp
print(result)
dump_2_json(result, 'eventid2template.json')
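# eventid2template.json maps each line number of template.txt to its list of
# cleaned words, e.g. (hypothetical): {"0": ["receive", "block", "src"], ...}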
# Save separately only the fastText word vectors that the templates actually use
template_set = set()
for key in result.keys():
    for word in result[key]:
        template_set.add(word)

import io
from tqdm import tqdm

# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))  # store a list, not a lazy map object
    return data
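# cc.en.300.vec is fastText's plain-text format: a "vocab_size dim" header
# line, then one word followed by its 300 floats per line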
fasttext = load_vectors('cc.en.300.vec')
template_fasttext_map = {}
for word in template_set:
    template_fasttext_map[word] = list(fasttext[word])
dump_2_json(template_fasttext_map, 'fasttext_map.json')
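Note that fasttext[word] raises a KeyError for any template word missing from cc.en.300.vec; if you hit this, you may want to fall back to a zero vector or skip the word.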
2.
import os
import json
import numpy as np
import pandas as pd
from collections import Counter
import math

def read_json(filename):
    with open(filename, 'r') as load_f:
        file_dict = json.load(load_f)
    return file_dict
eventid2template = read_json('eventid2template.json')
fasttext_map = read_json('fasttext_map.json')
print(eventid2template)

dataset = list()
with open('data/' + 'deepLog_hdfs_train.txt', 'r') as f:
    for line in f.readlines():
        # shift event ids from 1-indexed to 0-indexed
        line = tuple(map(lambda n: n - 1, map(int, line.strip().split())))
        dataset.append(line)
print(len(dataset))
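# deepLog_hdfs_train.txt is expected to hold one session per line as a
# space-separated sequence of integer event ids (1-indexed, hence the n - 1)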
idf_matrix = list()
for seq in dataset:
    for event in seq:
        idf_matrix.append(eventid2template[str(event)])
print(len(idf_matrix))

# Count word occurrences per template occurrence; idf_matrix stays a plain
# list because the templates have different lengths (np.array over ragged
# lists is deprecated in recent NumPy)
X_counts = []
for template_words in idf_matrix:
    word_counts = Counter(template_words)
    X_counts.append(word_counts)
print(X_counts[1000])

X_df = pd.DataFrame(X_counts)
X_df = X_df.fillna(0)
print(len(X_df))
print(X_df.head())
events = X_df.columns
print(events)
X = X_df.values
num_instance, num_event = X.shape
print('tf-idf here')
df_vec = np.sum(X > 0, axis=0)  # document frequency of each word
print(df_vec)
print('*' * 20)
print(num_instance)
# smooth idf as in sklearn: idf = ln((1 + n) / (1 + df)) + 1
idf_vec = np.log((num_instance + 1) / (df_vec + 1)) + 1
print(idf_vec)
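# For illustration: with num_instance = 1000 documents and a word appearing
# in 99 of them, idf = ln(1001 / 100) + 1 ≈ 3.30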
idf_matrix = X * np.tile(idf_vec, (num_instance, 1))
X_new = idf_matrix
print(X_new.shape)
print(X_new[1000])

word2idf = dict()
for i, j in zip(events, idf_vec):
    word2idf[i] = j
# smooth idf for out-of-vocabulary words, treating an OOV word as if it
# appeared in 29 documents (an arbitrary constant chosen here)
word2idf['oov'] = math.log((num_instance + 1) / (29 + 1)) + 1
print(word2idf)
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to serialize
    :param target_path: destination path of the JSON file
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    with open(target_path, 'w', encoding='utf-8') as file:
        file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))

dump_2_json(word2idf, 'word2idf.json')
3.
import json
import numpy as np
from collections import Counter

def read_json(filename):
    with open(filename, 'r') as load_f:
        file_dict = json.load(load_f)
    return file_dict

event2template = read_json('eventid2template.json')
fasttext = read_json('fasttext_map.json')
word2idf = read_json('word2idf.json')
event2semantic_vec = dict()
# Compute the TF of each word in every template, then build a TF-IDF-weighted
# sentence vector per template
for event in event2template.keys():
    template = event2template[event]
    tem_len = len(template)
    count = dict(Counter(template))
    for word in count.keys():
        TF = count[word] / tem_len  # term frequency within the template
        IDF = word2idf.get(word, word2idf['oov'])  # fall back to the OOV idf
        count[word] = TF * IDF
    # normalize the TF-IDF weights so they sum to 1
    value_sum = sum(count.values())
    for word in count.keys():
        count[word] = count[word] / value_sum
    semantic_vec = np.zeros(300)
    for word in count.keys():
        fasttext_weight = np.array(fasttext[word])
        semantic_vec += count[word] * fasttext_weight
    event2semantic_vec[event] = list(semantic_vec)
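# Each event's semantic vector is therefore the TF-IDF-weighted average of
# the fastText vectors of its template words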
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to serialize
    :param target_path: destination path of the JSON file
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    with open(target_path, 'w', encoding='utf-8') as file:
        file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))

dump_2_json(event2semantic_vec, 'event2semantic_vec_sameoov.json')
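For reference, a minimal sketch of loading the final output and looking up one event's vector (the key '0' is only an assumed example; keys are event ids stored as strings):

import json
import numpy as np

with open('event2semantic_vec_sameoov.json', 'r') as f:
    event2semantic_vec = json.load(f)

vec = np.array(event2semantic_vec['0'])  # hypothetical key
print(vec.shape)  # (300,)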
Originally posted by @donglee-afar in #3 (comment)