Python发送WEB请求,并对WEB内容进行解析

    技术2022-05-14  1

    # Batch lookup of related keywords: for every "<product id>,<product subject>"
    # line of an input file, query the smartquery HTTP service and append the
    # returned "hot" and "blue" keywords to an output file.
    #
    # Usage: python <this script> <input file> <output file>
    #
    # The code runs on Python 2.7 and Python 3.
    import gzip
    import io
    import os
    import sys
    from datetime import datetime

    try:  # Python 2
        import urllib2
    except ImportError:  # Python 3: urllib.request exposes the same Request/urlopen API
        import urllib.request as urllib2

    # Encoding used to decode the service response.
    PAGE_ENCODING = "gbk"
    # Encoding of the input and output files.  Plain ASCII files are handled
    # exactly as before; UTF-8 additionally lets non-ASCII keywords be written.
    # NOTE(review): UTF-8 is an assumption -- confirm against the real data files.
    FILE_ENCODING = "utf-8"

    # The request URL is URL_PREFIX + <product subject> + URL_SUFFIX.
    URL_PREFIX = 'http://10.20.137.17:30008/bin/smartquery?query='
    URL_SUFFIX = '&resconfig=skw'

    # Markers delimiting the keyword payload inside the response body.
    SKW_START_MARKER = '<![CDATA[{"en_skw":{'
    SKW_END_MARKER = '</field>'
    # A keyword list looks like "hot":["kw1","kw2"]; these delimit its content.
    LIST_END_MARKER = '"]'
    KEYWORD_SEPARATOR = '","'

    # Print the running line count every PROGRESS_INTERVAL processed lines.
    # NOTE(review): the interval (100) and the '%20' replacement used in
    # getRelateKeywords were reconstructed from a damaged copy of this script
    # in which percent sequences had been stripped -- confirm both values.
    PROGRESS_INTERVAL = 100


    def _toText(value):
        """Return value as text, decoding byte strings with FILE_ENCODING."""
        if isinstance(value, bytes):
            return value.decode(FILE_ENCODING)
        return value


    def _extractSkwField(page):
        """Return the part of page from SKW_START_MARKER up to SKW_END_MARKER.

        Returns '' when the start marker is missing.  When the end marker is
        missing, everything from the start marker onwards is returned.
        """
        start = page.find(SKW_START_MARKER)
        if start < 0:
            return ''
        end = page.find(SKW_END_MARKER, start)
        if end < 0:
            return page[start:]
        return page[start:end]


    def getRelateKeywords(requestUrl):
        """Fetch requestUrl and return the raw related-keywords payload.

        Spaces in the URL are replaced by '%20', a gzip-compressed response is
        accepted and decompressed, and the body is decoded as PAGE_ENCODING.

        Returns the text between the CDATA start marker and '</field>' (see
        _extractSkwField), or '' when the response holds no such payload.
        Network and decoding errors propagate to the caller.
        """
        # NOTE(review): only spaces are escaped; subjects containing characters
        # such as '&' or '#' will still produce a malformed query string.
        url = requestUrl.replace(' ', '%20')
        request = urllib2.Request(url)
        request.add_header("Accept-encoding", "gzip")

        usock = urllib2.urlopen(request)
        try:
            page = usock.read()
            contentEncoding = usock.headers.get('content-encoding', None)
        finally:
            # Release the connection even when reading fails.
            usock.close()

        if contentEncoding == 'gzip':
            page = gzip.GzipFile(fileobj=io.BytesIO(page)).read()
        if isinstance(page, bytes):
            page = page.decode(PAGE_ENCODING)
        return _extractSkwField(page)


    def _appendLines(lines, targetFilePath):
        """Append every item of lines to targetFilePath, one per line."""
        with io.open(targetFilePath, 'a', encoding=FILE_ENCODING) as fileHandler:
            fileHandler.write(u''.join(_toText(line) + u'\n' for line in lines))


    def writeResultToFile(productSubjectRelKw, targetFilePath):
        """Append productSubjectRelKw plus a newline to targetFilePath.

        The file is created when it does not exist and is always closed, even
        when the write fails.
        """
        _appendLines([productSubjectRelKw], targetFilePath)


    def getSplitSkwResult(relateKeywords, keywordsType):
        """Return the content of one keyword list of the payload.

        relateKeywords is the payload returned by getRelateKeywords and
        keywordsType is 'hot' or 'blue'.  For a payload containing
        "hot":["a","b"] the result for 'hot' is the string a","b, i.e. the
        keywords still joined by KEYWORD_SEPARATOR, ready to be split on it.

        Returns '' for any other keywordsType or when the list is absent.
        """
        if keywordsType not in ('hot', 'blue'):
            return ''
        listStart = '"%s":["' % keywordsType
        start = relateKeywords.find(listStart)
        if start < 0:
            return ''
        start += len(listStart)
        # Both lists end at the first '"]' after their start; no fixed offsets
        # relative to the end of the payload are needed.
        end = relateKeywords.find(LIST_END_MARKER, start)
        if end < 0:
            return relateKeywords[start:]
        return relateKeywords[start:end]


    def _splitProductLine(line):
        """Split an input line into (productId, productSubject).

        The line is split at its first comma after the line ending has been
        removed.  A line without a comma yields an empty id and the whole line
        as subject.
        """
        line = line.rstrip('\r\n')
        if ',' not in line:
            return '', line
        productId, _, productSubject = line.partition(',')
        return productId, productSubject


    def _buildRecords(productId, productSubject, relateKeywords):
        """Return the output records of one product, hot keywords first.

        Each record is 'id||subject||keyword||rank||type', where rank is the
        1-based position of the keyword inside its list.
        """
        records = []
        for keywordsType in ('hot', 'blue'):
            keywordsList = getSplitSkwResult(relateKeywords, keywordsType)
            if not keywordsList.strip():
                continue
            keywords = keywordsList.split(KEYWORD_SEPARATOR)
            for rank, keyword in enumerate(keywords, 1):
                records.append('||'.join(
                    (productId, productSubject, keyword, str(rank), keywordsType)))
        return records


    def getUrlResult(srcFilePath, targetFilePath):
        """Query the service for every line of srcFilePath and store the results.

        Each non-blank input line is '<product id>,<product subject>'.  The
        keywords returned for the subject are appended to targetFilePath, one
        record per keyword (see _buildRecords).  Blank lines are skipped.

        Returns the number of input lines that were processed.
        """
        lineCount = 0
        with io.open(srcFilePath, 'r', encoding=FILE_ENCODING) as srcFile:
            for line in srcFile:
                productId, productSubject = _splitProductLine(line)
                if not productId and not productSubject:
                    continue  # blank line: nothing to query

                requestUrl = URL_PREFIX + productSubject + URL_SUFFIX
                relateKeywords = getRelateKeywords(requestUrl)
                lineCount += 1
                if lineCount % PROGRESS_INTERVAL == 0:
                    print(lineCount)

                records = _buildRecords(productId, productSubject, relateKeywords)
                if records:
                    # One append per product keeps earlier results on disk if a
                    # later request fails, without reopening the file per record.
                    _appendLines(records, targetFilePath)
        return lineCount


    def main():
        """Command line entry point: <input file> <output file>."""
        if len(sys.argv) < 3:
            sys.exit("usage: %s <input file> <output file>" % sys.argv[0])

        start_time = datetime.now()
        srcFilePath = sys.argv[1]       # input file
        targetFilePath = sys.argv[2]    # output file

        count_total = getUrlResult(srcFilePath, targetFilePath)

        end_time = datetime.now()
        print("\n==================")
        print("Time total used :   %s" % (end_time - start_time,))
        print("Total: %s," % (count_total,))
        print("==================")


    if __name__ == '__main__':
        main()


    最新回复(0)