In [13]:
# Reference: https://blog.csdn.net/zhuzuwei/article/details/80857446
from os import path
import jieba
import jieba.analyse as ana
import jieba.posseg as pseg
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
import os
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from gensim import similarities
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest,chi2

del_dict=["\"",","," ","。","\n","(",")","g","、","-","/","#","(",")","h",",",":","“","”"] # specific characters/words to strip out
input_path1 = "C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失眠症状文本/"
input_path2 = "C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失眠处方文本/"
dict_dir1 = r"C:/Users/meika/Desktop/src/#dictZZ.txt"
dict_dir2 = r"C:/Users/meika/Desktop/src/#dictCF.txt"
stopwords_dir = r"C:/Users/meika/Desktop/src/stopwords.txt"
In [14]:
input_path = input_path2
dict_dir = dict_dir2
stop_words_dict = []
with open(stopwords_dir, errors='ignore',encoding="utf-8") as fr:
    for line in fr.readlines():
        stop_words_dict.append(line.strip())
stop_words_dict
Out[14]:
['服上方',
 '自述妇科',
 '年余',
 '吹风',
 '失眠',
 '病史',
 '七剂',
 '三次',
 '两条',
 '上方',
 '各',
 '另冲',
 '另包',
 '两瓶',
 '其中',
 '冲服',
 '分包',
 '前方',
 '十剂',
 '各广',
 '各焦',
 '各熟',
 '各生',
 '服用',
 '睡眠',
 '好转',
 '欠佳',
 '如前',
 '同前',
 '边有']
In [15]:
# Segment each text file
def fenCi(src_dir,dict_dir):
    # read the txt file
    with open(src_dir, encoding='utf-8') as f:
        inTxt = f.read()
    # strip letters and digits (dosages etc.)
    inTxt = re.sub('[a-zA-Z0-9]','',inTxt)
    # load the user dictionary, then segment
    jieba.load_userdict(dict_dir)
    words_list = jieba.lcut(inTxt)
    # drop stop words and single-character tokens
    return [w for w in words_list if w not in stop_words_dict and len(w) > 1]

# For TF-IDF: join one document's tokens with spaces
def fenCi2(inTxt):
    inTxt = re.sub('[a-zA-Z0-9]','',inTxt)
    # load the user dictionary, then segment
    jieba.load_userdict(dict_dir)
    words_list = jieba.lcut(inTxt)
    return ' '.join([w for w in words_list if w not in stop_words_dict and len(w) > 1])

# Same as fenCi, but takes the text itself rather than a file path
def fenCi3(inTxt):
    inTxt = re.sub('[a-zA-Z0-9]','',inTxt)
    # load the user dictionary, then segment
    jieba.load_userdict(dict_dir)
    words_list = jieba.lcut(inTxt)
    return [w for w in words_list if w not in stop_words_dict and len(w) > 1]

# Build a DataFrame with one row per document: index, raw text, word list, source path
def getDataFrame(input_path,dict_dir):
    i = 0
    df = pd.DataFrame(columns=('index','text','wordlist','src'))
    whole_file = [os.path.join(input_path,file) for file in os.listdir(input_path)]
    for src_dir in whole_file:
        # read the raw text
        with open(src_dir, encoding='utf-8') as f:
            data = f.read()
        wordlist = fenCi(src_dir,dict_dir)
        df = df.append(pd.DataFrame({'index':[i],'text':[data],'wordlist':[wordlist],'src':[src_dir]}),ignore_index=True)
        i = i + 1
    return df
In [16]:
df = getDataFrame(input_path,dict_dir)
In [17]:
df.head()
Out[17]:
index text wordlist src
0 0 黄连10 竹茹10 法夏10 陈皮10 厚朴15 白术15 枣仁30 夜交藤30 合欢皮15... [黄连, 竹茹, 法夏, 陈皮, 厚朴, 白术, 枣仁, 夜交藤, 合欢皮, 栀子, 淡竹叶... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失...
1 1 柴胡15 白芍15 当归10 丹参20 浮小麦30 合欢皮15 苍术15 栀子1... [柴胡, 白芍, 当归, 丹参, 浮小麦, 合欢皮, 苍术, 栀子, 郁金, 沙参, 枣仁,... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失...
2 2 柴胡15 白芍15 当归10 丹参10 浮小麦20 合欢皮10 苍术10 栀子10 沙参... [柴胡, 白芍, 当归, 丹参, 浮小麦, 合欢皮, 苍术, 栀子, 沙参, 夜交藤, 灯心... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失...
3 3 党参20 白术15 茯苓20 焦楂曲(炒山楂 炒神曲)10 丹参15 合欢皮15 ... [党参, 白术, 茯苓, 焦楂曲, 山楂, 神曲, 丹参, 合欢皮, 枸杞, 沙参, 灯心草... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失...
4 4 党参20 白术15 茯苓20 焦楂曲(炒山楂 炒神曲)10 丹参15 合欢皮15 ... [党参, 白术, 茯苓, 焦楂曲, 山楂, 神曲, 丹参, 合欢皮, 枸杞, 沙参, 灯心草... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失...

2. Computing document similarity

1. Implementation with word2vec from gensim

In [18]:
n_dim = 300         # vector dimensionality; 300-500 works well for large corpora
 
w2vmodel = Word2Vec(size = n_dim, min_count = 10)     # ignore words that occur fewer than 10 times
w2vmodel.build_vocab(df.wordlist)                     # build the vocabulary
w2vmodel

w2vmodel.train(df.wordlist,total_examples = w2vmodel.corpus_count, epochs = 10)
# the trained model

"""print(w2vmodel.wv['党参'].shape)
w2vmodel.wv['党参'] # word vector

print(w2vmodel.wv['紧张'].shape)
w2vmodel.wv['紧张'] # word vector
"""


# save the model locally: w2vmodel.save("word2vec模型")
#w2vmodel.wv.save_word2vec_format("word2vec模型")
Out[18]:
"print(w2vmodel.wv['党参'].shape)\nw2vmodel.wv['党参'] #相似度\n\nprint(w2vmodel.wv['紧张'].shape)\nw2vmodel.wv['紧张'] #相似度\n"
In [19]:
"""
#保存到本地
modelZZ.wv.save_word2vec_format(filename)
#model.save("model") 保存到本地,是乱码
"""
# 词向量间的相似度
"""w2vmodel.wv.most_similar('紧张')"""
w2vmodel.wv.most_similar('三七')
Out[19]:
[('蜈蚣', 0.9999138116836548),
 ('威灵仙', 0.9999090433120728),
 ('荷叶', 0.9999072551727295),
 ('桑枝', 0.9999045729637146),
 ('丹参', 0.9999035000801086),
 ('生甘草', 0.999901533126831),
 ('怀牛膝', 0.99989914894104),
 ('仙灵脾', 0.9998983144760132),
 ('白僵蚕', 0.9998980760574341),
 ('桑叶', 0.9998971223831177)]
In [20]:
# analogy queries: find corresponding relations
"""w2vmodel.wv.most_similar(positive=['紧张'],negative=['时'],topn=10)"""
w2vmodel.wv.most_similar(positive=['党参'],negative=['女贞子'],topn=10)
Out[20]:
[('山楂', 0.009990092366933823),
 ('神曲', 0.007794942706823349),
 ('焦楂曲', 0.007245061919093132),
 ('香附', 0.004245918244123459),
 ('淡豆豉', 0.0039774030447006226),
 ('柴胡', 0.003875482827425003),
 ('苍术', 0.003774590790271759),
 ('白芍', 0.0037542134523391724),
 ('白蔻仁', 0.0037468187510967255),
 ('桂枝', 0.00372130423784256)]
In [21]:
# find the word that does not belong with the others
"""w2vmodel.wv.doesnt_match("紧张 头痛 头晕 头痛 仍 再见".split())"""
w2vmodel.wv.doesnt_match("党参 白术 茯苓 焦楂 曲 炒 山楂".split())
E:\Anaconda\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
Out[21]:
'山楂'
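The calls above compare individual words. For document-level similarity, the topic of this section, one common approach is to average each document's word vectors; this is an assumed addition, not part of the original notebook:

# Assumption: a document vector is the mean of its in-vocabulary word vectors (pre-4.0 gensim API).
def doc_vector(words, model, dim):
    vecs = [model.wv[w] for w in words if w in model.wv.vocab]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

doc_mtx = np.vstack([doc_vector(words, w2vmodel, n_dim) for words in df.wordlist])
w2v_dist = pairwise_distances(doc_mtx, metric='cosine')   # 832 x 832 cosine distance matrix
w2v_dist[:5, :5]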

2. Bag-of-words model: sklearn implementation

In [22]:
cleanchap = [fenCi2(w) for w in df.text]
cleanchap[0:5]
Out[22]:
['黄连 竹茹 法夏 陈皮 厚朴 白术 枣仁 夜交藤 合欢皮 栀子 淡竹叶 石斛 枸杞 生地 浮小麦 煅龙 煅牡蛎 煅龙骨 甘草',
 '柴胡 白芍 当归 丹参 浮小麦 合欢皮 苍术 栀子 郁金 沙参 枣仁 夜交藤 灯心草 夏枯草 茺蔚子 杜仲',
 '柴胡 白芍 当归 丹参 浮小麦 合欢皮 苍术 栀子 沙参 夜交藤 灯心草 桑叶 五味子 枸杞 甘草',
 '党参 白术 茯苓 焦楂曲 山楂 神曲 丹参 合欢皮 枸杞 沙参 灯心草 杜仲 狗脊 煅龙 煅牡蛎 煅龙骨 五味子 黄芪 枣仁 夜交藤',
 '党参 白术 茯苓 焦楂曲 山楂 神曲 丹参 合欢皮 枸杞 沙参 灯心草 杜仲 煅龙 煅牡蛎 煅龙骨 五味子 黄芪 枣仁 夜交藤 夏枯草 肉桂']
In [23]:
countvec = CountVectorizer(min_df=3)   # ignore words that appear in fewer than 3 documents
resmtx = countvec.fit_transform(cleanchap)
print(pairwise_distances(resmtx, metric = 'cosine').shape)
pairwise_distances(resmtx, metric = 'cosine')
(832, 832)
Out[23]:
array([[0.        , 0.71323033, 0.64459067, ..., 0.77058427, 0.72963096,
        0.72963096],
       [0.71323033, 0.        , 0.28995305, ..., 0.4375    , 0.46966991,
        0.46966991],
       [0.64459067, 0.28995305, 0.        , ..., 0.54815194, 0.5131355 ,
        0.5131355 ],
       ...,
       [0.77058427, 0.4375    , 0.54815194, ..., 0.        , 0.11611652,
        0.11611652],
       [0.72963096, 0.46966991, 0.5131355 , ..., 0.11611652, 0.        ,
        0.        ],
       [0.72963096, 0.46966991, 0.5131355 , ..., 0.11611652, 0.        ,
        0.        ]])
In [24]:
# Similarity computation on the TF-IDF matrix
transformer = TfidfTransformer()
# convert the count matrix into TF-IDF weights
tfidf = transformer.fit_transform(resmtx)         # TF-IDF computed from the count matrix
pairwise_distances(tfidf[:5],metric='cosine')
Out[24]:
array([[0.        , 0.83278922, 0.76454455, 0.77997571, 0.76252693],
       [0.83278922, 0.        , 0.36079435, 0.62778922, 0.54494498],
       [0.76454455, 0.36079435, 0.        , 0.59348846, 0.56125051],
       [0.77997571, 0.62778922, 0.59348846, 0.        , 0.20117364],
       [0.76252693, 0.54494498, 0.56125051, 0.20117364, 0.        ]])

3. gensim implementation: the similarity matrix computed by gensim is hard to use directly with sklearn

In [25]:
# a list of token lists is required
chaplist = [w for w in df.wordlist]
#chaplist
In [27]:
dictionary = corpora.Dictionary(chaplist)
corpus = [dictionary.doc2bow(text) for text in chaplist]     # still a list of lists
 
simmtx = similarities.MatrixSimilarity(corpus)
simmtx
Out[27]:
<gensim.similarities.docsim.MatrixSimilarity at 0x228bbf68a20>
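As the heading notes, gensim's similarity object is awkward to feed into sklearn. A minimal sketch of one workaround, relying on the fact (shown in the next cell) that MatrixSimilarity keeps a dense numpy array in its index attribute:

# The dense (n_docs, n_terms) array inside simmtx can be handed to sklearn directly.
gensim_mtx = np.asarray(simmtx.index)
pairwise_distances(gensim_mtx[:5], metric='cosine')   # cosine distances among the first five documents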

4. Cosine similarity based on LDA

In [28]:
# LDA reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html#sphx-glr-auto-examples-tutorials-run-lda-py
simmtx.index[:].shape
Out[28]:
(832, 310)
In [29]:
# Demonstrate with an LDA model fitted in gensim
tfidf_model = models.TfidfModel(corpus)        # build the TF-IDF model
corpus_tfidf = tfidf_model[corpus]             # apply TF-IDF to the corpus
ldamodel = LdaModel(corpus_tfidf, id2word = dictionary, num_topics = 5, passes = 5)
# save the model: ldamodel.save("LDA模型(处方)")
# word distribution of each topic
topic_list= ldamodel.print_topics(5)
for topic in topic_list:
    print(topic)
(0, '0.015*"当归" + 0.015*"灯芯" + 0.014*"五味子" + 0.014*"桂枝" + 0.013*"煅牡蛎" + 0.013*"煅龙骨" + 0.013*"苍术" + 0.013*"白芷" + 0.013*"煅龙" + 0.013*"知母"')
(1, '0.021*"夏枯草" + 0.021*"神曲" + 0.020*"焦楂曲" + 0.020*"山楂" + 0.019*"枳壳" + 0.018*"知母" + 0.017*"郁金" + 0.017*"栀子" + 0.017*"百合" + 0.017*"柴胡"')
(2, '0.024*"连翘" + 0.020*"二花" + 0.018*"牡蛎" + 0.016*"麦冬" + 0.016*"射干" + 0.015*"葛根" + 0.015*"胖大海" + 0.014*"山药" + 0.014*"玄参" + 0.014*"天麻"')
(3, '0.018*"薏苡仁" + 0.018*"葛根" + 0.016*"地肤子" + 0.016*"磁石" + 0.015*"胆草" + 0.014*"黄芩" + 0.014*"枳实" + 0.014*"玄参" + 0.014*"川芎" + 0.013*"鸡血藤"')
(4, '0.023*"厚朴" + 0.019*"陈皮" + 0.018*"砂仁" + 0.018*"煅龙" + 0.017*"煅牡蛎" + 0.017*"煅龙骨" + 0.017*"白术" + 0.016*"柴胡" + 0.016*"枳壳" + 0.016*"太子参"')
In [31]:
# retrieve the documents most similar to document 1 (df.text[1])
query = df.text[1]

quer_bow = dictionary.doc2bow(fenCi3(query))
lda_vec = ldamodel[quer_bow]         # the query as a vector under the LDA model
sims = simmtx[lda_vec]               # cosine-similarity query of this vector against the matrix
sims = sorted(enumerate(sims), key = lambda item:-item[1])
sims[0:5]
Out[31]:
[(462, 0.47420555),
 (252, 0.3778452),
 (111, 0.32406497),
 (105, 0.30519068),
 (346, 0.3051896)]
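num_topics = 5 above is an arbitrary choice. A minimal sketch, not part of the original notebook, of scoring it with gensim's CoherenceModel (c_v measure) so that different topic counts can be compared:

from gensim.models import CoherenceModel

# Higher coherence usually means more interpretable topics; refit with other num_topics values to compare.
cm = CoherenceModel(model=ldamodel, texts=chaplist, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())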

3. Feature extraction

1. TF-IDF

In [33]:
# Reference: https://zhuanlan.zhihu.com/p/54785656
corpus = cleanchap
# convert the texts into a term-count matrix
vectorizer = CountVectorizer(min_df=3)  
# count how often each word appears
X = vectorizer.fit_transform(corpus)  
# get all the words in the bag-of-words vocabulary
word = vectorizer.get_feature_names()   
# inspect the counts
#word ,
X.toarray()  
Out[33]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]], dtype=int64)
In [34]:
# instantiate the transformer
transformer = TfidfTransformer()  
# convert the count matrix X into TF-IDF weights
tfidf = transformer.fit_transform(X)  
# inspect: tfidf[i][j] is the TF-IDF weight of word j in document i
transformer  ,tfidf.toarray() 
Out[34]:
(TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True),
 array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.29098758,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.26483224,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.26483224,
         0.        ]]))
In [43]:
# extract the TF-IDF matrix; element a[i][j] is the weight of word j in document i
weight = tfidf.toarray() 
for i in range(len(weight)):   # outer loop over documents, inner loop over the word weights of one document
    print(u"------- TF-IDF word weights for document", i, u"------")
    for j in range(len(word)):  
        if(weight[i][j]>0):
            print(word[j],weight[i][j])
    # only show the first two documents
    if i ==1:
        break
------- TF-IDF word weights for document 0 ------
厚朴 0.2599071368942431
合欢皮 0.13564980513384325
夜交藤 0.13601150659744962
枣仁 0.11846970783984959
枸杞 0.21243562473278024
栀子 0.27583856876919294
法夏 0.2952584300567002
浮小麦 0.1741052339488054
淡竹叶 0.3340910523011811
煅牡蛎 0.20348962325033368
煅龙 0.20544527202855584
煅龙骨 0.20348962325033368
甘草 0.12591475399316468
生地 0.2163714306470172
白术 0.1494862956254793
石斛 0.25703661578103576
竹茹 0.3636946904303227
陈皮 0.287969572493419
黄连 0.19970273525791937
------- TF-IDF word weights for document 1 ------
丹参 0.33011521031790053
合欢皮 0.1441647721642138
夏枯草 0.24103205673809006
夜交藤 0.1445491782386702
当归 0.18098489298552253
杜仲 0.28601314252670706
枣仁 0.12590625119027035
柴胡 0.17602122373868997
栀子 0.2931534209096498
沙参 0.3433316256664417
浮小麦 0.18503411309777487
灯心草 0.2673500915525717
白芍 0.15864598283361986
苍术 0.20075081635880296
茺蔚子 0.46258894067678774
郁金 0.2021569948955979
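The two-step CountVectorizer + TfidfTransformer pipeline above can also be written in one step; a minimal sketch with sklearn's TfidfVectorizer, which should give essentially the same weights under the same min_df:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(min_df=3)           # same min_df as the CountVectorizer above
tfidf_direct = tfidf_vec.fit_transform(corpus)  # equivalent to CountVectorizer followed by TfidfTransformer
tfidf_direct.shape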

4. Document clustering

In [44]:
clf = KMeans(n_clusters = 5)
s = clf.fit(tfidf)
print(s)
clf.cluster_centers_
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
Out[44]:
array([[0.00594933, 0.0063435 , 0.00927094, ..., 0.        , 0.04244838,
        0.        ],
       [0.01426449, 0.00163154, 0.        , ..., 0.        , 0.00898938,
        0.        ],
       [0.01557607, 0.00534587, 0.01642123, ..., 0.00193179, 0.01918467,
        0.00117081],
       [0.01438473, 0.        , 0.00711562, ..., 0.00328925, 0.03507288,
        0.00629952],
       [0.0102389 , 0.        , 0.00325375, ..., 0.01116146, 0.12522392,
        0.        ]])
In [84]:
print(len(clf.labels_))
clf.labels_
832
Out[84]:
array([0, 4, 4, 0, 0, 3, 3, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 0,
       3, 3, 4, 4, 1, 1, 4, 2, 2, 1, 4, 4, 4, 4, 1, 1, 1, 3, 1, 1, 1, 1,
       0, 0, 3, 0, 4, 4, 4, 4, 4, 3, 4, 3, 3, 4, 4, 4, 4, 0, 0, 0, 0, 4,
       2, 0, 0, 3, 3, 1, 1, 1, 1, 3, 4, 2, 4, 4, 4, 4, 1, 0, 0, 0, 4, 3,
       0, 1, 1, 4, 4, 4, 4, 4, 3, 0, 3, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 4, 0, 0, 4, 1, 1, 4, 4, 4, 4, 4, 1, 0, 4, 4, 0, 0, 4, 4, 4,
       3, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 4, 4, 4, 0, 0, 3, 1, 1, 4, 4, 4,
       2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 0, 4, 1, 0, 1,
       4, 4, 1, 1, 3, 4, 4, 0, 0, 0, 0, 4, 3, 3, 0, 0, 4, 3, 1, 1, 4, 0,
       4, 3, 4, 3, 1, 4, 1, 3, 4, 3, 0, 3, 3, 4, 3, 4, 4, 3, 4, 4, 3, 0,
       0, 1, 4, 3, 1, 1, 3, 3, 1, 4, 3, 4, 4, 0, 4, 4, 4, 4, 4, 2, 1, 1,
       2, 0, 4, 1, 0, 0, 3, 3, 4, 4, 4, 3, 4, 4, 1, 3, 3, 0, 4, 0, 0, 1,
       3, 4, 4, 2, 2, 2, 2, 2, 4, 3, 3, 4, 4, 3, 4, 4, 3, 1, 3, 0, 0, 1,
       4, 1, 4, 3, 3, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 0,
       0, 0, 0, 1, 3, 3, 3, 3, 4, 1, 4, 3, 4, 1, 1, 4, 2, 4, 4, 4, 4, 2,
       0, 1, 1, 0, 1, 1, 4, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 4, 1, 2, 3, 3,
       2, 2, 0, 0, 0, 1, 0, 1, 1, 1, 3, 4, 4, 4, 1, 0, 0, 3, 3, 0, 0, 0,
       0, 0, 2, 2, 2, 3, 4, 0, 3, 1, 3, 1, 4, 3, 1, 1, 1, 4, 1, 0, 3, 3,
       4, 4, 0, 0, 0, 0, 4, 2, 2, 0, 3, 0, 0, 0, 0, 0, 0, 4, 4, 0, 4, 3,
       3, 3, 0, 0, 0, 4, 3, 0, 4, 4, 0, 0, 0, 0, 4, 0, 3, 3, 3, 1, 1, 0,
       4, 1, 1, 1, 1, 4, 4, 4, 4, 0, 1, 1, 1, 1, 1, 1, 4, 1, 3, 3, 3, 0,
       4, 3, 3, 3, 1, 3, 1, 1, 1, 3, 0, 3, 4, 3, 2, 1, 2, 2, 2, 0, 1, 1,
       1, 2, 1, 1, 2, 2, 4, 4, 4, 0, 3, 0, 3, 2, 0, 0, 3, 1, 1, 4, 0, 0,
       3, 0, 4, 4, 0, 4, 3, 4, 4, 4, 1, 1, 1, 3, 1, 1, 4, 0, 3, 2, 4, 2,
       0, 0, 1, 4, 4, 4, 0, 3, 3, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 3, 1, 3,
       3, 3, 1, 1, 0, 4, 3, 3, 4, 4, 1, 1, 0, 1, 4, 3, 3, 3, 3, 3, 0, 2,
       3, 4, 0, 4, 4, 4, 3, 3, 3, 1, 0, 3, 2, 2, 2, 2, 3, 2, 0, 1, 1, 4,
       0, 0, 0, 3, 3, 4, 0, 3, 3, 4, 0, 0, 0, 4, 4, 1, 3, 1, 1, 1, 0, 4,
       4, 4, 4, 0, 0, 0, 4, 4, 4, 3, 4, 0, 0, 0, 0, 1, 0, 0, 0, 4, 4, 3,
       1, 0, 3, 4, 4, 2, 1, 1, 1, 1, 3, 3, 4, 4, 0, 0, 4, 4, 0, 4, 4, 4,
       1, 0, 1, 4, 3, 1, 3, 1, 0, 0, 0, 1, 3, 3, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 3, 0, 0, 0, 0, 1, 1, 4, 4, 4, 2, 1, 4, 1, 3, 3, 3, 1, 0, 3,
       0, 3, 2, 2, 3, 3, 3, 4, 4, 0, 4, 3, 4, 4, 4, 0, 3, 0, 0, 4, 0, 4,
       4, 0, 4, 0, 3, 4, 4, 4, 0, 4, 2, 0, 3, 0, 1, 1, 1, 1, 1, 2, 1, 4,
       4, 1, 3, 3, 3, 3, 3, 4, 3, 4, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 4, 4,
       3, 0, 1, 1, 0, 0, 1, 0, 1, 0, 2, 2, 1, 1, 1, 4, 2, 2, 2, 4, 3, 1,
       4, 4, 1, 1, 3, 0, 4, 1, 3, 3, 1, 1, 1, 1, 4, 4, 3, 0, 0, 0, 4, 0,
       2, 1, 1, 1, 3, 4, 0, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4])
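TSNE and matplotlib are imported at the top of the notebook but never used in this section. A minimal sketch, not part of the original run, of projecting the TF-IDF matrix to 2D and coloring the points by their KMeans label:

# t-SNE needs a dense array; the TF-IDF matrix is sparse.
tsne = TSNE(n_components=2, random_state=0)
coords = tsne.fit_transform(tfidf.toarray())
plt.scatter(coords[:, 0], coords[:, 1], c=clf.labels_, cmap='tab10', s=10)
plt.title('KMeans clusters in t-SNE projection')
plt.show()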
In [45]:
# attach the cluster label to each document; it will serve as the class label later
df['clsres'] = clf.labels_
df.head()
Out[45]:
index text wordlist src clsres
0 0 黄连10 竹茹10 法夏10 陈皮10 厚朴15 白术15 枣仁30 夜交藤30 合欢皮15... [黄连, 竹茹, 法夏, 陈皮, 厚朴, 白术, 枣仁, 夜交藤, 合欢皮, 栀子, 淡竹叶... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失... 1
1 1 柴胡15 白芍15 当归10 丹参20 浮小麦30 合欢皮15 苍术15 栀子1... [柴胡, 白芍, 当归, 丹参, 浮小麦, 合欢皮, 苍术, 栀子, 郁金, 沙参, 枣仁,... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失... 2
2 2 柴胡15 白芍15 当归10 丹参10 浮小麦20 合欢皮10 苍术10 栀子10 沙参... [柴胡, 白芍, 当归, 丹参, 浮小麦, 合欢皮, 苍术, 栀子, 沙参, 夜交藤, 灯心... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失... 2
3 3 党参20 白术15 茯苓20 焦楂曲(炒山楂 炒神曲)10 丹参15 合欢皮15 ... [党参, 白术, 茯苓, 焦楂曲, 山楂, 神曲, 丹参, 合欢皮, 枸杞, 沙参, 灯心草... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失... 1
4 4 党参20 白术15 茯苓20 焦楂曲(炒山楂 炒神曲)10 丹参15 合欢皮15 ... [党参, 白术, 茯苓, 焦楂曲, 山楂, 神曲, 丹参, 合欢皮, 枸杞, 沙参, 灯心草... C:/Users/meika/Desktop/Temp/蔡/实训汇总3/2.生成文本文件/失... 1
In [46]:
# merge the texts within each cluster
chapgrp = df.groupby('clsres')
chapcls = chapgrp.agg(sum)              # with string columns, sum simply concatenates the strings
 
cuttxt = lambda x: ' '.join(fenCi3(x))
chapclsres = chapcls.text.apply(cuttxt)
chapclsres
Out[46]:
clsres
0    熟地 山药 山茱萸 黄芪 桂枝 附片 厚朴 黄柏 知母 白术 煅龙 煅牡蛎 煅龙骨 怀牛膝 ...
1    黄连 竹茹 法夏 陈皮 厚朴 白术 枣仁 夜交藤 合欢皮 栀子 淡竹叶 石斛 枸杞 生地 浮...
2    柴胡 白芍 当归 丹参 浮小麦 合欢皮 苍术 栀子 郁金 沙参 枣仁 夜交藤 灯心草 夏枯草...
3    党参 白术 茯苓 焦楂曲 山楂 神曲 柏子仁 丹参 黄芪 枸杞 柴胡 枣仁 黄连 夜交藤 夏...
4    柴胡 黄芩 法夏 厚朴 生晒参 自备 川芎 黄连 桂枝 肉桂 生甘草 百合 益智仁 枳壳 白...
Name: text, dtype: object
In [47]:
# remove stop words and keep only the keywords

# list keywords to characterize each cluster
ana.set_stop_words(stopwords_dir)
 
for item in chapclsres:
    print(ana.extract_tags(item, topK = 10))
['熟地', '枣仁', '山药', '黄芪', '山茱萸', '夜交藤', '甘草', '茯苓', '白术', '当归']
['煅牡蛎', '煅龙骨', '煅龙', '合欢皮', '枣仁', '浮小麦', '夜交藤', '茯神', '甘草', '川芎']
['枣仁', '合欢皮', '白术', '黄芪', '白芍', '夜交藤', '甘草', '浮小麦', '柴胡', '玄参']
['焦楂曲', '神曲', '山楂', '合欢皮', '枣仁', '白术', '黄芪', '甘草', '白芍', '夜交藤']
['枣仁', '夜交藤', '合欢皮', '茯神', '知母', '川芎', '龙齿', '甘草', '苍术', '郁金']
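n_clusters = 5 above was chosen by hand. A minimal sketch, not part of the original notebook, of comparing a few candidate values with the silhouette score on the same TF-IDF matrix:

from sklearn.metrics import silhouette_score

# Scores closer to 1 indicate tighter, better separated clusters.
for k in (3, 4, 5, 6, 7):
    labels_k = KMeans(n_clusters=k).fit_predict(tfidf)
    print(k, silhouette_score(tfidf, labels_k, metric='cosine'))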

5. Document classification

1. Direct classification

In [48]:
cutlist = lambda x: ' '.join(x)
x_data = df.wordlist.apply(cutlist)
y_data = list(df.clsres)

# (1) Classify on raw word counts
count_vectorizer = CountVectorizer(min_df = 2)
all_words_count = count_vectorizer.fit_transform(x_data)
#x_data
#for key,value in count_vectorizer.vocabulary_.items():
#    print(key,value)

# the vocabulary as a list
#count_vectorizer.get_feature_names()
# the vocabulary as a dict, key: word, value: column index in the count matrix
count_vectorizer.vocabulary_
# each row of the sparse matrix prints as (document index, vocabulary index)  count
#for i in all_words_count:
#    print(i)
Out[48]:
{'黄连': 237,
 '竹茹': 152,
 '法夏': 91,
 '陈皮': 214,
 '厚朴': 27,
 '白术': 130,
 '枣仁': 68,
 '夜交藤': 36,
 '合欢皮': 28,
 '栀子': 76,
 '淡竹叶': 96,
 '石斛': 143,
 '枸杞': 72,
 '生地': 123,
 '浮小麦': 95,
 '煅龙': 108,
 '煅牡蛎': 106,
 '煅龙骨': 109,
 '甘草': 122,
 '柴胡': 75,
 '白芍': 131,
 '当归': 56,
 '丹参': 3,
 '苍术': 171,
 '郁金': 205,
 '沙参': 90,
 '灯心草': 100,
 '夏枯草': 35,
 '茺蔚子': 177,
 '杜仲': 65,
 '桑叶': 79,
 '五味子': 10,
 '党参': 18,
 '茯苓': 175,
 '焦楂曲': 105,
 '山楂': 50,
 '神曲': 150,
 '狗脊': 114,
 '黄芪': 236,
 '肉桂': 163,
 '柏子仁': 74,
 '生甘草': 125,
 '太子参': 42,
 '百合': 136,
 '胡黄连': 167,
 '川芎': 53,
 '决明子': 21,
 '牛蒡子': 112,
 '淡豆豉': 97,
 '枳壳': 70,
 '茯神': 174,
 '砂仁': 148,
 '黄精': 234,
 '炒栀子': 103,
 '丹皮': 4,
 '蜈蚣': 191,
 '龙齿': 241,
 '葛根': 183,
 '苦参': 173,
 '知母': 141,
 '天麻': 41,
 '灵芝': 102,
 '黄芩': 235,
 '升麻': 25,
 '生晒参': 124,
 '诃子': 195,
 '白芷': 133,
 '藿香': 188,
 '焦三仙': 104,
 '煅瓦楞子': 107,
 '香附': 222,
 '佛手': 14,
 '桂枝': 77,
 '灯芯': 101,
 '益智仁': 139,
 '自备': 169,
 '玄参': 116,
 '珍珠母': 119,
 '附片': 213,
 '熟地': 110,
 '天花粉': 39,
 '全瓜': 19,
 '怀牛膝': 57,
 '车前子': 201,
 '山药': 52,
 '山茱萸': 51,
 '黄柏': 233,
 '独活': 115,
 '寄生': 46,
 '小蓟': 49,
 '天门冬': 40,
 '麦冬': 231,
 '威灵仙': 44,
 '龙眼肉': 238,
 '红景天': 156,
 '丝瓜络': 2,
 '土鳖': 31,
 '防风': 210,
 '石菖蒲': 146,
 '远志': 203,
 '三七': 0,
 '水蛭': 88,
 '续断': 160,
 '五加皮': 9,
 '全蝎': 20,
 '白僵蚕': 128,
 '补骨脂': 193,
 '骨碎补': 224,
 '桑寄生': 80,
 '桔红': 85,
 '浙贝': 93,
 '射干': 48,
 '桑枝': 81,
 '干姜': 55,
 '鸡血藤': 229,
 '紫菀': 154,
 '女贞子': 43,
 '旱莲草': 59,
 '生草': 126,
 '蛇舌草': 190,
 '细辛': 158,
 '半枝': 26,
 '白蔻仁': 134,
 '阿胶': 212,
 '首乌': 220,
 '桑白皮': 82,
 '玫瑰花': 118,
 '红花': 157,
 '益母草': 140,
 '石膏': 145,
 '前仁': 23,
 '胆草': 165,
 '侧柏叶': 16,
 '二花': 8,
 '百部': 137,
 '败酱草': 197,
 '杏仁': 64,
 '地骨皮': 34,
 '泽泻': 92,
 '紫苏': 153,
 '颗粒剂': 219,
 '薏苡仁': 187,
 '地肤子': 33,
 '连翘': 204,
 '土茯苓': 30,
 '巴戟天': 54,
 '淫羊': 98,
 '果仁': 67,
 '花粉': 170,
 '西洋参': 194,
 '仙灵脾': 11,
 '牡蛎': 113,
 '赤石脂': 199,
 '石榴皮': 144,
 '磁石': 149,
 '金刚': 206,
 '木香': 63,
 '木蝴蝶': 61,
 '乌贼骨': 7,
 '赤芍': 200,
 '水煎服': 87,
 '沉香': 89,
 '佩兰': 15,
 '锻龙': 209,
 '大黄': 38,
 '皂角刺': 138,
 '木瓜': 60,
 '蔓荆子': 185,
 '密蒙花': 47,
 '桑葚': 83,
 '菊花': 180,
 '青葙子': 215,
 '肉苁蓉': 164,
 '枳实': 71,
 '荷叶': 178,
 '十四': 24,
 '枸杞子': 73,
 '桃仁': 78,
 '辛夷': 202,
 '三草': 1,
 '安神': 45,
 '胶囊': 168,
 '吴茱萸': 29,
 '瓦楞子': 121,
 '田七': 127,
 '伸筋草': 13,
 '鸡内金': 228,
 '蒲公英': 184,
 '龙骨': 240,
 '石韦': 147,
 '萹蓄': 182,
 '枣皮': 69,
 '白芨': 132,
 '龙胆草': 239,
 '阳起石': 211,
 '钩藤': 208,
 '龟胶': 242,
 '熬膏': 111,
 '薏仁': 186,
 '颗粒': 218,
 '香薷': 221,
 '大枣': 37,
 '段龙': 86,
 '莲子心': 179,
 '菟丝子': 181,
 '鹿角霜': 230,
 '僵蚕': 17,
 '谷精草': 196,
 '琥珀': 120,
 '木通': 62,
 '绞股蓝': 159,
 '红参': 155,
 '麻黄': 232,
 '胖大海': 166,
 '鱼腥草': 226,
 '茵陈': 176,
 '白寇': 129,
 '桔梗': 84,
 '绿萼': 161,
 '竹叶': 151,
 '刘寄奴': 22,
 '青蒿': 216,
 '贯众': 198,
 '羌活': 162,
 '苏子': 172,
 '高良姜': 225,
 '韭菜子': 217,
 '蛇床子': 189,
 '浙贝母': 94,
 '蝉衣': 192,
 '白薇': 135,
 '石决明': 142,
 '火麻仁': 99,
 '金钱草': 207,
 '乌药': 6,
 '扁豆': 58,
 '鳖甲': 227,
 '伏神': 12,
 '马勃': 223,
 '玉竹': 117,
 '板蓝根': 66,
 '地榆': 32,
 '乌梢蛇': 5}
In [49]:
x_train,x_test,y_train,y_test = train_test_split(all_words_count, y_data, test_size = 0.2)

# try logistic regression and SVM

my_lr = LogisticRegression()
my_svm1 = SVC(kernel = 'linear')
my_svm2 = SVC(kernel='rbf')
In [50]:
%time my_lr.fit(x_train,y_train)
%time my_svm1.fit(x_train,y_train)
%time my_svm2.fit(x_train,y_train)

print(classification_report(y_test, my_lr.predict(x_test)))
print(classification_report(y_test, my_svm1.predict(x_test)))
print(classification_report(y_test, my_svm2.predict(x_test)))
Wall time: 72.8 ms
Wall time: 58.8 ms
Wall time: 123 ms
             precision    recall  f1-score   support

          0       1.00      0.88      0.93        16
          1       0.98      1.00      0.99        44
          2       0.93      1.00      0.96        52
          3       0.97      0.97      0.97        29
          4       1.00      0.88      0.94        26

avg / total       0.97      0.96      0.96       167

             precision    recall  f1-score   support

          0       1.00      0.88      0.93        16
          1       0.98      1.00      0.99        44
          2       0.95      1.00      0.97        52
          3       0.97      0.97      0.97        29
          4       1.00      0.92      0.96        26

avg / total       0.97      0.97      0.97       167

             precision    recall  f1-score   support

          0       1.00      0.12      0.22        16
          1       0.94      1.00      0.97        44
          2       0.72      1.00      0.84        52
          3       0.97      0.97      0.97        29
          4       1.00      0.65      0.79        26

avg / total       0.89      0.86      0.83       167

In [51]:
# (2) Consider only whether a word occurs (set counts to 1)
tests = np.nonzero(all_words_count)    # row/column indices of the non-zero entries
In [52]:
new_all_words_count = all_words_count.copy()   # copy first, so the original count matrix is not modified in place
new_all_words_count[tests[0],tests[1]]=1
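The same presence/absence features can also be produced without touching the count matrix at all; a minimal sketch using CountVectorizer's binary flag:

# One-step alternative: binary term-occurrence features.
binary_vectorizer = CountVectorizer(min_df = 2, binary = True)
binary_counts = binary_vectorizer.fit_transform(x_data)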
In [53]:
x_train,x_test,y_train,y_test = train_test_split(new_all_words_count, y_data, test_size = 0.2)
 
my_lr = LogisticRegression()
my_svm1 = SVC(kernel = 'linear')
my_svm2 = SVC(kernel='rbf')
 
%time my_lr.fit(x_train,y_train)
%time my_svm1.fit(x_train,y_train)
%time my_svm2.fit(x_train,y_train)

print(classification_report(y_test, my_lr.predict(x_test)))
print(classification_report(y_test, my_svm1.predict(x_test)))
print(classification_report(y_test, my_svm2.predict(x_test)))
Wall time: 11 ms
Wall time: 43.9 ms
Wall time: 125 ms
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        18
          1       0.93      1.00      0.96        39
          2       0.96      0.98      0.97        52
          3       1.00      0.88      0.94        26
          4       1.00      0.97      0.98        32

avg / total       0.97      0.97      0.97       167

             precision    recall  f1-score   support

          0       0.95      1.00      0.97        18
          1       0.93      1.00      0.96        39
          2       0.96      0.87      0.91        52
          3       0.96      0.88      0.92        26
          4       0.86      0.94      0.90        32

avg / total       0.93      0.93      0.93       167

             precision    recall  f1-score   support

          0       1.00      0.17      0.29        18
          1       0.83      1.00      0.91        39
          2       0.68      0.98      0.80        52
          3       0.96      0.88      0.92        26
          4       1.00      0.56      0.72        32

avg / total       0.85      0.80      0.77       167

In [54]:
# show the predicted class of each test sample
my_lr.predict(x_test)
Out[54]:
array([1, 1, 2, 3, 2, 4, 0, 3, 0, 1, 2, 0, 4, 3, 1, 3, 0, 4, 1, 1, 2, 3,
       0, 4, 4, 3, 2, 2, 1, 1, 4, 0, 1, 1, 1, 0, 1, 2, 4, 4, 2, 1, 2, 2,
       1, 4, 1, 4, 1, 2, 1, 3, 1, 3, 3, 4, 2, 2, 2, 4, 1, 2, 4, 2, 4, 2,
       0, 1, 1, 1, 1, 4, 2, 2, 0, 4, 4, 1, 4, 0, 2, 4, 3, 2, 1, 3, 4, 4,
       1, 1, 0, 0, 4, 3, 1, 2, 2, 1, 3, 4, 4, 0, 1, 2, 3, 2, 4, 1, 3, 2,
       2, 2, 1, 2, 0, 4, 4, 1, 4, 2, 1, 2, 3, 1, 2, 3, 2, 2, 4, 2, 2, 2,
       1, 2, 2, 3, 3, 3, 2, 0, 4, 3, 2, 2, 2, 3, 2, 2, 1, 2, 1, 2, 0, 2,
       1, 2, 2, 1, 3, 2, 2, 0, 1, 0, 4, 1, 2])

2. PCA dimensionality reduction

In [55]:
pca = PCA(n_components=0.9)
# PCA cannot operate on a sparse matrix, so convert to a dense matrix first
# n_components: int, float, None or string. The number of principal components to keep:
#   n_components=1 reduces the data to one dimension; n_components='mle' picks the number automatically;
#   a float such as 0.9 keeps enough components to explain that fraction of the variance;
#   None (the default) keeps every feature (the features themselves are still transformed).
# copy: True or False, default True; whether to copy the original training data.
# whiten: True or False, default False; whether to whiten so every component has the same variance.
all_wc_mtx = all_words_count.todense()
new_x = pca.fit_transform(all_wc_mtx)
new_x_train,new_x_test,new_y_train,new_y_test = train_test_split(new_x,y_data,test_size = 0.3)
new_x_train.shape
Out[55]:
(582, 81)
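With n_components=0.9 the number of retained components (81 here) is picked automatically; a quick check of what was kept and how much variance it explains:

print(pca.n_components_)                       # number of components actually kept
print(pca.explained_variance_ratio_.sum())     # should be just above 0.9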
In [56]:
my_lr2 = LogisticRegression()
my_svm21 = SVC(kernel='linear')
my_svm22 = SVC(kernel='rbf')

%time my_lr2.fit(new_x_train, new_y_train)
%time my_svm21.fit(new_x_train, new_y_train)
%time my_svm22.fit(new_x_train, new_y_train)

print(classification_report(new_y_test, my_lr2.predict(new_x_test)))
print(classification_report(new_y_test, my_svm21.predict(new_x_test)))
print(classification_report(new_y_test, my_svm22.predict(new_x_test)))
Wall time: 34.9 ms
Wall time: 32.9 ms
Wall time: 44.9 ms
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        24
          1       0.97      1.00      0.99        69
          2       0.94      0.96      0.95        80
          3       0.97      0.97      0.97        39
          4       0.94      0.89      0.92        38

avg / total       0.96      0.96      0.96       250

             precision    recall  f1-score   support

          0       1.00      0.96      0.98        24
          1       0.97      0.97      0.97        69
          2       0.93      0.95      0.94        80
          3       0.97      0.97      0.97        39
          4       0.86      0.84      0.85        38

avg / total       0.94      0.94      0.94       250

             precision    recall  f1-score   support

          0       1.00      0.54      0.70        24
          1       0.93      1.00      0.97        69
          2       0.84      0.93      0.88        80
          3       0.97      0.95      0.96        39
          4       0.86      0.84      0.85        38

avg / total       0.91      0.90      0.90       250

3. Classification after feature selection with the chi-square test

In [57]:
"""可尝试选不同的k"""
model1 = SelectKBest(chi2, k=100)       # 选择100个最好的特征
new_x2 = model1.fit_transform(all_words_count,y_data)
new_x2
Out[57]:
<832x100 sparse matrix of type '<class 'numpy.int64'>'
	with 10732 stored elements in Compressed Sparse Row format>
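To see which words survived the chi-square selection, the support mask can be mapped back onto the vectorizer's vocabulary; a minimal sketch:

selected_words = np.array(count_vectorizer.get_feature_names())[model1.get_support()]
print(len(selected_words))      # 100
selected_words[:20]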
In [58]:
new_x_train2, new_x_test2, new_y_train2, new_y_test2 = train_test_split(new_x2, y_data, test_size = 0.3)
 
my_lr3 = LogisticRegression()
my_svm31 = SVC(kernel='linear')
my_svm32 = SVC(kernel='rbf')

%time my_lr3.fit(new_x_train2, new_y_train2)
%time my_svm31.fit(new_x_train2, new_y_train2)
%time my_svm32.fit(new_x_train2, new_y_train2)
 
print(classification_report(new_y_test2, my_lr3.predict(new_x_test2)))
print(classification_report(new_y_test2, my_svm31.predict(new_x_test2)))
print(classification_report(new_y_test2, my_svm32.predict(new_x_test2)))
Wall time: 12 ms
Wall time: 27.9 ms
Wall time: 59.9 ms
             precision    recall  f1-score   support

          0       0.93      0.93      0.93        15
          1       0.98      1.00      0.99        65
          2       0.94      0.95      0.94        76
          3       1.00      0.96      0.98        45
          4       0.92      0.92      0.92        49

avg / total       0.96      0.96      0.96       250

             precision    recall  f1-score   support

          0       0.82      0.93      0.87        15
          1       0.98      1.00      0.99        65
          2       0.93      0.91      0.92        76
          3       0.98      0.93      0.95        45
          4       0.88      0.90      0.89        49

avg / total       0.94      0.94      0.94       250

             precision    recall  f1-score   support

          0       1.00      0.47      0.64        15
          1       0.96      1.00      0.98        65
          2       0.80      0.97      0.88        76
          3       0.96      0.96      0.96        45
          4       0.95      0.71      0.81        49

avg / total       0.91      0.90      0.89       250