Python中文自然语言处理：一、基础文本处理

2017/11/30 0 人评论 38,791 次阅读

文章目录

对中文进行分词
去除文本中的标点符号
生成一个单词的起始位置
去除重复词
对文本应用Zipf定律
相似性度量

对中文进行分词

import jieba

text = '你好，我正在进行Python自然语言处理，有些问题需要处理,笑哈哈'

# jieba.cut returns a lazy generator of tokens. Materialise it directly
# instead of joining on ' ' and splitting again: that round trip turns any
# space already present in the input into empty-string tokens.
word = jieba.cut(text)
word_list = list(word)
print(word_list)

输出：

['你好', '，', '我', '正在', '进行', 'Python', '自然语言', '处理', '，', '有些', '问题', '需要', '处理', ',', '笑哈哈']

去除文本中的标点符号

import re

# Punctuation pattern: the first character class covers whitespace and ASCII
# punctuation, the second covers full-width (Chinese) punctuation.
# The ampersand has to be a literal '&'. Written as the HTML entity '&amp;'
# inside a character class it would additionally match the letters 'a', 'm',
# 'p' and ';', so ordinary tokens starting with those letters would be
# treated as punctuation.
reg = r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+"

# Build a new list rather than calling remove() while iterating over the same
# list: each removal shifts the remaining items left, so the element directly
# after a removed one is never examined and consecutive punctuation tokens
# would survive.
word_list = [token for token in word_list if re.match(reg, token) is None]
print(word_list)

输出：

['你好', '我', '正在', '进行', 'Python', '自然语言', '处理', '有些', '问题', '需要', '处理', '笑哈哈']

生成一个单词的起始位置

# Strip punctuation from the raw text first, so the offsets printed below are
# positions within the punctuation-free string.
# The pattern is a raw string so that '\s', '\.', '\!' and '\/' reach the
# regex engine unchanged instead of being parsed as (invalid) string escapes,
# and the ampersand is a literal '&' rather than the HTML entity '&amp;',
# which would also delete the letters 'a', 'm', 'p' and ';' from the text.
text_no_punp = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "", text)

# jieba.tokenize yields one (word, start, end) tuple per token.
print(list(jieba.tokenize(text_no_punp)))

输出：

[('你好', 0, 2), ('我', 2, 3), ('正在', 3, 5), ('进行', 5, 7), ('Python', 7, 13), ('自然语言', 13, 17), ('处理', 17, 19), ('有些', 19, 21), ('问题', 21, 23), ('需要', 23, 25), ('处理', 25, 27), ('笑哈哈', 27, 30)]

去除重复词

class RepeatReplacer:
    """Collapse immediately repeated characters inside a word.

    Every pair of identical adjacent word characters is reduced to a single
    character, repeatedly, until no such pair is left (for example
    "高高兴兴" becomes "高兴").

    The pattern uses ``\\w``, so letters, digits and underscores are all
    affected, and legitimate double letters are collapsed as well
    (for example "hello" becomes "helo").
    """

    def __init__(self):
        # Any word character directly followed by itself (group 2 and its
        # back-reference), with the rest of the word captured around it.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Keep the surrounding text and a single copy of the doubled character.
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return ``word`` with all adjacent duplicate characters collapsed.

        Args:
            word: The string to clean up.

        Returns:
            The string after substitutions have been applied until the
            result stops changing.
        """
        # Iterate instead of recursing: one substitution pass removes only one
        # duplicate per run of word characters, so a long run of repeats
        # needs many passes and would exhaust the recursion limit.
        while True:
            repl_word = self.repeat_regexp.sub(self.repl, word)
            if repl_word == word:
                return repl_word
            word = repl_word

# Create a replacer instance for the demonstration below.
replacer = RepeatReplacer()

# Bare expression: its return value is only displayed when run in an
# interactive session or notebook; as a plain script nothing is printed.
replacer.replace("高高兴兴")

输出：

'高兴'

对文本应用Zipf定律

import sys

import matplotlib

# The backend has to be chosen before pyplot/pylab is imported, and the
# 'MacOSX' backend only exists on macOS; elsewhere keep matplotlib's default.
if sys.platform == 'darwin':
    matplotlib.use('MacOSX')

import matplotlib.pyplot as plt
import nltk
from nltk.corpus import brown, gutenberg
from nltk.probability import FreqDist

# Make Chinese labels and the minus sign render correctly.
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# Count every token of every file in the Gutenberg corpus.
# The loop variable is named `fileid` so it does not overwrite the
# module-level `text` defined earlier in the script.
fd = FreqDist()
for fileid in gutenberg.fileids():
    fd.update(gutenberg.words(fileid))

# Zipf's law relates frequency to rank, so the counts must be ordered from
# most to least frequent. Plain iteration over a FreqDist does not guarantee
# that order; most_common() does.
freqs = [count for _, count in fd.most_common()]
ranks = list(range(1, len(freqs) + 1))

plt.figure(figsize=(15, 8))
plt.loglog(ranks, freqs, '.-')
# x axis carries the rank, y axis the frequency.
plt.xlabel('排名(r)', fontsize=14, fontweight='bold')
plt.ylabel('词频(f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()

相似性度量

from nltk.metrics import (
    accuracy,
    binary_distance,
    jaccard_distance,
    masi_distance,
)

text1 = '你好，我正在使用Python自然语言处理，有些问题正在处理,嘿嘿'

# Materialise the token generator directly; joining on ' ' and splitting
# again would create empty-string tokens if the text contained spaces.
word1 = jieba.cut(text1)
word_list1 = list(word1)

# Filter punctuation into a new list. Removing items from a list while
# iterating over it skips the element that follows each removal.
word_list1 = [token for token in word_list1 if re.match(reg, token) is None]
print(word_list)
print(word_list1)

# Accuracy: fraction of positions at which the two token lists agree.
# The lists are compared position by position, so they must have the same
# length (nltk raises ValueError otherwise).
print(accuracy(word_list, word_list1))
# Jaccard distance between the two token sets.
print(jaccard_distance(set(word_list), set(word_list1)))
# MASI distance between the two token sets.
print(masi_distance(set(word_list), set(word_list1)))
# Binary distance: 0.0 if the two sets are equal, 1.0 otherwise.
print(binary_distance(set(word_list), set(word_list1)))

输出：

['你好', '我', '正在', '进行', 'Python', '自然语言', '处理', '有些', '问题', '需要', '处理', '笑哈哈']
['你好', '我', '正在', '使用', 'Python', '自然语言', '处理', '有些', '问题', '正在', '处理', '嘿嘿']
0.75
0.38461538461538464
0.12692307692307692
1.0

原文地址：https://zmister.com/archives/198.html