# Notes on Python 3 gotchas exercised in this notebook:
# - encode/decode convert between str and bytes; file modes 'r' (text) vs 'rb'
#   (binary) matter because of this str/bytes split.
# - range(50) returns a lazy range object; use list(range(50)) to get a list.
# - set() supports the | (union) operator for accumulating elements.
# - The re module provides pattern compile / match / search / split.
import numpy as np
def loadDataSet():
    """Return the toy corpus: six tokenized posts and their class labels.

    A label of 1 marks an abusive post, 0 a normal one.
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]
    return posts, labels
def createVocabList(dataSet):
    """Return a list of the unique words appearing in any document of dataSet.

    The order of the returned list is arbitrary (set iteration order).
    """
    vocabSet = set()  # idiomatic empty set (was set([]))
    for document in dataSet:
        vocabSet |= set(document)  # union-accumulate each document's words
    return list(vocabSet)
# Build the demo vocabulary from the toy posts and display it;
# myVocabList / listOfPosts / listClasses are reused by later cells.
listOfPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)
print(type(myVocabList))
print(myVocabList)
['posting', 'so', 'my', 'take', 'ate', 'not', 'help', 'mr', 'cute', 'please', 'quit', 'I', 'licks', 'has', 'park', 'love', 'buying', 'is', 'dalmation', 'him', 'how', 'steak', 'stupid', 'stop', 'dog', 'flea', 'problems', 'worthless', 'food', 'maybe', 'garbage', 'to']
# Demo: constructing a set removes duplicates (REPL echo below shows {1, 2, 3}).
a = set([1,2,3,1])
a
{1, 2, 3}
def setOfWord2Vec(vocablist, inputSet):
    """Convert a document into a binary set-of-words vector over vocablist.

    Returns a list of 0/1 flags, one per vocabulary word: 1 if the word occurs
    in inputSet, else 0. Words absent from the vocabulary are reported and
    skipped.
    """
    # Precompute word -> index once so each lookup is O(1) instead of the
    # original O(len(vocablist)) list.index() scan per word.
    wordIndex = {word: i for i, word in enumerate(vocablist)}
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
        else:
            print("the word: {0} is not in vocablist".format(word))
    return returnVec
# Vectorize post 3 ('stop posting stupid worthless garbage') against the vocabulary.
print(setOfWord2Vec(myVocabList, listOfPosts[3]))
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]
# String search demos: index/rindex raise ValueError on a miss, while
# find/rfind return -1. Renamed the variable to 's' so the builtin str()
# type is no longer shadowed.
s = 'abca'
lis = ['a','b','a']
print(s.index('a'))      # 0: first occurrence
print(s.index('a',1))    # 3: search starts at position 1
print(s.find('a'))       # 0
print(s.find('a',2))     # 3
print(s.rfind('a'))      # 3: last occurrence
print(s.rindex('a'))     # 3
# list has no .find() method — this line deliberately raises AttributeError,
# as shown in the traceback captured below.
print(lis.find('a'))
0
3
0
3
3
3
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
in ()
8 print(str.rindex('a'))
9 #list find
---> 10 print(lis.find('a'))
AttributeError: 'list' object has no attribute 'find'
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model.

    trainMatrix: list/array of binary word vectors, one row per document.
    trainCategory: per-document labels (1 = abusive, anything else = class 0).
    Returns (p0Vect, p1Vect, pAbusive): per-word conditional probabilities for
    each class and the prior of class 1, with Laplace (add-one) smoothing.
    """
    numDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior probability of the abusive class.
    pAbusive = sum(trainCategory) / float(numDocs)
    # Laplace smoothing: word counts start at 1, denominators at 2.
    wordCounts = {0: np.ones(numWords), 1: np.ones(numWords)}
    totals = {0: 2.0, 1: 2.0}
    for rowVec, label in zip(trainMatrix, trainCategory):
        cls = 1 if label == 1 else 0  # original lumps every non-1 label into class 0
        wordCounts[cls] += rowVec
        totals[cls] += sum(rowVec)
    p0Vect = wordCounts[0] / totals[0]
    p1Vect = wordCounts[1] / totals[1]
    return p0Vect, p1Vect, pAbusive
# Build the training matrix (one binary word vector per post), train the
# model, and inspect the learned probabilities (outputs captured below).
trainMat = []
for postinDoc in listOfPosts:
    trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
p0V,p1V,pAb = trainNB0(trainMat, listClasses)
print(trainMat)
print(p0V)
print(p1V)
print(pAb)
[[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]]
[ 0.03846154 0.07692308 0.15384615 0.03846154 0.07692308 0.03846154
0.07692308 0.07692308 0.07692308 0.07692308 0.03846154 0.07692308
0.07692308 0.07692308 0.03846154 0.07692308 0.03846154 0.07692308
0.07692308 0.11538462 0.07692308 0.07692308 0.03846154 0.07692308
0.07692308 0.07692308 0.07692308 0.03846154 0.03846154 0.03846154
0.03846154 0.07692308]
[ 0.0952381 0.04761905 0.04761905 0.0952381 0.04761905 0.0952381
0.04761905 0.04761905 0.04761905 0.04761905 0.0952381 0.04761905
0.04761905 0.04761905 0.0952381 0.04761905 0.0952381 0.04761905
0.04761905 0.0952381 0.04761905 0.04761905 0.19047619 0.0952381
0.14285714 0.04761905 0.04761905 0.14285714 0.0952381 0.0952381
0.0952381 0.0952381 ]
0.5
# REPL echo of the class-1 prior (0.5: three of six posts are abusive).
pAb
0.5
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    """Classify a binary word vector: return 1 for class 1, else 0.

    Compares the two class posteriors in log space; products of small
    per-word probabilities become sums of logs, avoiding underflow.
    """
    logP1 = np.log(pClass1) + sum(vec2Classify * np.log(p1Vect))
    logP0 = np.log(1 - pClass1) + sum(vec2Classify * np.log(p0Vect))
    return 1 if logP1 > logP0 else 0
def testingNB():
    """End-to-end demo: train on the toy posts, then classify two test documents."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWord2Vec(vocab, doc) for doc in posts]
    p0V, p1V, pAb = trainNB0(trainMat, labels)
    # One friendly and one abusive test document.
    for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        docVec = np.array(setOfWord2Vec(vocab, entry))
        print('{0} classified as : {1}'.format(entry, classifyNB(docVec, p0V, p1V, pAb)))
# Run the end-to-end demo (expected outputs captured below).
testingNB()
['love', 'my', 'dalmation'] classified as : 0
['stupid', 'garbage'] classified as : 1
def textParse(bigString):
    """Split a string into lowercase tokens longer than two characters.

    Splits on any run of non-word characters. Returns [] for None input
    (the original implicitly returned None, which crashed iterating callers).
    """
    import re
    if bigString is None:  # 'is not None' idiom; was '!= None'
        return []
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Naive Bayes spam filter with a random 10-document hold-out test.

    Reads email/spam/{0..25}.txt (label 1) and email/ham/{0..25}.txt
    (label 0), trains on the remainder, and prints the hold-out error rate.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(26):
        # 'with' closes each file handle (the original leaked them).
        with open('email\\spam\\%d.txt' % i, 'r') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open('email\\ham\\%d.txt' % i, 'r') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # BUG FIX: the candidate pool must cover every loaded document; the
    # original hard-coded range(50) although 26 spam + 26 ham = 52 documents
    # were loaded, so the last two were silently never used.
    trainingSet = list(range(len(docList)))
    testSet = []
    # Randomly move 10 documents from the training pool into the test set.
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(np.array(trainingMat), np.array(trainingClasses))
    errorCount = 0.0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is : ', float(errorCount) / len(testSet))
# Run the spam-filter experiment (requires the email/ data files on disk).
spamTest()
the error rate is : 0.1
# Encoding demos: str.encode returns bytes.
'ab'.encode('gbk')
# NOTE: '\0xab' is the NUL character ('\0' is an octal escape) followed by the
# literal text "xab" — hence the b'\x00xab' result below, not the byte 0xAB.
'\0xab'.encode('gbk')
b'\x00xab'
# Detect the encoding of a sample ham email with the third-party chardet
# package; the file must be opened in binary mode ('rb') since chardet
# operates on bytes. NOTE(review): the file handle is never closed here.
import chardet
f = open('email\\ham\\6.txt','rb')
chardet.detect(f.read())
{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}
# In Python 3 range() is lazy; list() materializes it (output below).
print(list(range(50)))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]