Merging a Pretrained Word-Vector Model with a Self-Trained Model
This post merges a pretrained word-vector model with a self-trained one. The tokenizer used below comes from the previous article.
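For context, here is a minimal sketch of the setup assumed by the merging code; this is not from the original post, and the names and values (max_features, the 'text' column) are hypothetical placeholders standing in for the previous article's actual configuration.

import pandas as pd
from keras.preprocessing.text import Tokenizer

# Hypothetical: cap on vocabulary size, as in the previous article.
max_features = 100000
tokenizer = Tokenizer(num_words=max_features)
# train/test are the DataFrames used later in this post; 'text' is
# assumed to be the text column.
tokenizer.fit_on_texts(list(train['text']) + list(test['text']))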
import numpy as np
import codecs

EMBEDDING_FILE = '../glove.6B/crawl-300d-2M.vec'
EMBEDDING_TRAIN = '../glove.6B/vectors_train.txt'
embed_size = 300
#EMBEDDING_FILE = INPUT + 'glove.6B/glove.6B.300d.txt'

cn = 0
def get_coefs(word, *arr):
    # Parse one "word v1 v2 ... vN" line. Malformed lines (e.g. a .vec
    # header line, or words containing spaces) get a zero vector
    # instead of crashing the load.
    global cn
    cn += 1
    dict_v = np.asarray(arr, dtype='float32')
    if len(dict_v) != embed_size:
        dict_v = np.zeros(embed_size)
    return word, dict_v

# Load the pretrained vectors.
f_emb = codecs.open(EMBEDDING_FILE, 'r', 'utf-8')
emb_list = f_emb.readlines()
embeddings_index = dict(get_coefs(*o.strip().split()) for o in emb_list)
print(cn)
f_emb.close()

# Load the self-trained vectors (produced by GloVe; see below).
f_emb = codecs.open(EMBEDDING_TRAIN, 'r', 'utf-8')
emb_list = f_emb.readlines()
cn = 0
embeddings_index_train = dict(get_coefs(*o.strip().split()) for o in emb_list)
print(cn)
f_emb.close()

# Initialize the embedding matrix from the pretrained distribution so
# that words missing from both models are not all-zero.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
print(emb_mean, emb_std)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
novector = 0
for word, i in word_index.items():
    if i >= nb_words:
        continue
    # Prefer the pretrained vector; fall back to the self-trained one.
    embedding_vector = embeddings_index.get(word)
    embedding_vector_train = embeddings_index_train.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    elif embedding_vector_train is not None:
        embedding_matrix[i] = embedding_vector_train
    else:
        print(word)  # word missing from both models
        novector += 1
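Once embedding_matrix is filled, it can be wired into a model. A minimal sketch, assuming a Keras Embedding layer and a hypothetical padded sequence length maxlen (neither appears in the original code):

from keras.layers import Embedding

maxlen = 100  # hypothetical padded sequence length
embedding_layer = Embedding(nb_words, embed_size,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)  # freeze the merged vectors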
The self-trained model is produced as follows.
import pandas as pd

# Dump the combined train/test text to a plain-text corpus for GloVe.
full_df = pd.concat([train, test])
full_df.to_csv('text.txt', index=False, sep=' ', quotechar=' ',
               columns=['text'], header=False, encoding='utf-8')
"""
Get 'text.txt'.
https://nlp.stanford.edu/projects/glove/  GloVe-1.2.zip
demo.sh:
CORPUS=text.txt
VOCAB_FILE=vocab.txt
COOCCURRENCE_FILE=cooccurrence.bin
COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
BUILDDIR=build
SAVE_FILE=vectors_train
VERBOSE=2
MEMORY=4.0
VOCAB_MIN_COUNT=2
VECTOR_SIZE=300
MAX_ITER=60
WINDOW_SIZE=15
BINARY=2
NUM_THREADS=8
X_MAX=10
You can get "vectors_train.txt".
"""
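As a quick sanity check before merging (my addition, not part of the original workflow), you can confirm that vectors_train.txt has the expected 300-dimensional vectors:

import codecs

with codecs.open('vectors_train.txt', 'r', 'utf-8') as f:
    first = f.readline().strip().split()
# With VECTOR_SIZE=300 above, each line should be a word plus 300 floats.
print('first word:', first[0], 'dimensions:', len(first) - 1)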