"""Fetch related "hot" and "blue" keywords for product subjects.

Reads ``productId,productSubject`` lines from an input file, queries the
smartquery service once per line and appends one
``productId||productSubject||keyword||rank||type`` record per returned
keyword to an output file.

Usage: <script> <srcFilePath> <targetFilePath>
"""
import os
import sys
import gzip
from datetime import datetime

try:
    # Python 2 module names, as originally used by this script.
    import urllib2
    import StringIO
    _BytesBuffer = StringIO.StringIO
except ImportError:
    # Python 3: urllib2 lives on as urllib.request; byte buffers are in io.
    import io
    import urllib.request as urllib2
    _BytesBuffer = io.BytesIO

# Start of the CDATA section that carries the keyword payload.
_SKW_START_MARKER = '<![CDATA[{"en_skw":{'
_SKW_END_MARKER = '</field>'

# Print the running line count every this many input lines.
# NOTE(review): restored from the garbled token `lineCount0 == 0`, which
# looks like `lineCount%100 == 0` after accidental URL-decoding -- confirm.
_PROGRESS_INTERVAL = 100


def _extractSkwField(page):
    """Return the slice of *page* from the keyword CDATA marker up to ``</field>``.

    Returns '' when the start marker is absent. When the closing tag is
    absent the remainder of the page is returned unchanged instead of
    silently losing its last character.
    """
    start = page.find(_SKW_START_MARKER)
    if start == -1:
        return ''
    page = page[start:]
    end = page.find(_SKW_END_MARKER)
    if end == -1:
        return page
    return page[:end]


def getRelateKeywords(requestUrl):
    """Request *requestUrl* and return the related-keywords payload as text.

    The response is requested with gzip encoding, decompressed when the
    server says it is gzip, decoded as GBK, and cut down to the keyword
    CDATA section. Returns '' when the response has no such section.
    Network errors from ``urlopen`` propagate to the caller.
    """
    # NOTE(review): the pasted source showed replace(' ', ' '), a no-op that
    # looks like a URL-decoded '%20'; restored here -- confirm.
    url = requestUrl.replace(' ', '%20')
    page_encode = "gbk"

    request = urllib2.Request(url)
    request.add_header("Accept-encoding", "gzip")
    usock = urllib2.urlopen(request)
    try:
        page = usock.read()
        isGzipped = usock.headers.get('content-encoding', None) == 'gzip'
    finally:
        # Release the connection even if reading fails.
        usock.close()

    if isGzipped:
        page = gzip.GzipFile(fileobj=_BytesBuffer(page)).read()
    if isinstance(page, bytes):
        page = page.decode(page_encode)

    return _extractSkwField(page)


def writeResultToFile(productSubjectRelKw, targetFilePath):
    """Append *productSubjectRelKw* plus a newline to *targetFilePath*."""
    with open(targetFilePath, 'a') as fileHandler:
        fileHandler.write(productSubjectRelKw + '\n')


def getSplitSkwResult(relateKeywords, keywordsType):
    """Return the raw keyword run of one list inside *relateKeywords*.

    *keywordsType* is 'hot' or 'blue'. The result is the text between the
    opening ``"<type>":["`` and the end of that list, i.e. keywords still
    joined by ``","`` (split on that to get individual keywords). Returns ''
    for any other type or when the list is not present.
    """
    if keywordsType == 'hot':
        marker = '"hot":["'
        start = relateKeywords.find(marker)
        # find() returns -1 when absent; 0 is a valid match position.
        if start != -1:
            rest = relateKeywords[start + len(marker):]
            return rest[:rest.find('"]')]
    elif keywordsType == 'blue':
        marker = '"blue":["'
        start = relateKeywords.find(marker)
        if start != -1:
            rest = relateKeywords[start + len(marker):]
            # NOTE(review): the "- 6" and the single space inside the end
            # marker are kept exactly as found; whitespace in the source was
            # collapsed, so verify both against a real service response.
            return rest[:rest.find('"]}} ]]>') - 6]
    return ''


def _writeRankedKeywords(productId, productSubject, keywordsList,
                         keywordsType, targetFilePath):
    """Write one ranked output record per keyword in *keywordsList*."""
    if not keywordsList.strip():
        return
    for rank, keyword in enumerate(keywordsList.split('","'), 1):
        record = '||'.join(
            [productId, productSubject, keyword, str(rank), keywordsType])
        writeResultToFile(record, targetFilePath)


def getUrlResult(srcFilePath, targetFilePath):
    """Query keywords for every line of *srcFilePath*; return the line count.

    Each input line is ``productId,productSubject``. Results are appended
    to *targetFilePath*, hot keywords first, then blue, each ranked from 1.
    """
    lineCount = 0
    with open(srcFilePath, 'r') as fileHandler:
        for line in fileHandler:
            # Strip only the line terminator; a blind [:-1] would eat a real
            # character on a final line without a newline and leave '\r'
            # behind on CRLF input.
            line = line.rstrip('\r\n')
            commaPos = line.find(',')
            # Without a comma the id is empty and the whole line is the subject.
            productId = line[:commaPos] if commaPos != -1 else ''
            productSubject = line[commaPos + 1:]

            requestUrl = ('http://10.20.137.17:30008/bin/smartquery?query='
                          + productSubject + '&resconfig=skw')
            relateKeywords = getRelateKeywords(requestUrl)

            lineCount += 1
            if lineCount % _PROGRESS_INTERVAL == 0:
                print(lineCount)

            for keywordsType in ('hot', 'blue'):
                _writeRankedKeywords(
                    productId, productSubject,
                    getSplitSkwResult(relateKeywords, keywordsType),
                    keywordsType, targetFilePath)
    return lineCount


def main():
    """Run the export for the input and output paths given on the command line."""
    if len(sys.argv) < 3:
        sys.exit('usage: %s <srcFilePath> <targetFilePath>' % sys.argv[0])

    start_time = datetime.now()
    srcFilePath = sys.argv[1]     # input file
    targetFilePath = sys.argv[2]  # output file
    count_total = getUrlResult(srcFilePath, targetFilePath)
    end_time = datetime.now()

    print("\n==================")
    print("Time total used :  %s" % (end_time - start_time,))
    print("Total: %s," % (count_total,))
    print("==================")


if __name__ == '__main__':
    main()