import re import urllib from bs4 import BeautifulSoup url = "http://journals.plos.org/plosone/article?id=info%3Adoi/10.1371/journal.pone.0162069" response = urllib.urlopen(url) page = response.read() soup = BeautifulSoup(page, "lxml") # kill all script and style elements for s ...
将一段话中的句子分离出来不是一件容易的事。因为句子的开头和结尾并不是很规则,而且句子内部会出现句号。这使得通过单一的正则表达式分离句子是不可能的。有时你能成功,但大多数时候你会出错。这里我们用nltk模块来做。 第一部分:使用正则表达式 import re paragraph = Mr. Smith bought cheapsite.com for ...
import re from collections import Counter #define a function to print the result by line def printByLine(tuples): return( '\n'.join(' '.join(map(str,t)) for t in tuples)) #define a function to print the result alphabetically def countsSortedAlphabetically(c ...