from BeautifulSoup import *
import urllib2
import urlparse
import re
import operator
from operator import itemgetter

# Per-document term table: url -> {word: occurrence count}.
# Entries are added by crawl() using the dict returned by readpage().
index={}
# Total number of term occurrences over every document in `index`.
# Updated by search() and read by LM() as the collection length.
coltermscount=0.0

def search(q):
    """Rank every indexed document against query *q* and print the ranking.

    The collection length (module-level ``coltermscount``) is recomputed
    from ``index``, each document is scored with LM(), and the resulting
    ``(document, score)`` tuples are printed best score first.

    Args:
        q: the raw query string.
    """
    global coltermscount

    # Rebuild the collection length from scratch. Adding onto the previous
    # value would inflate it on every call after the first and skew the
    # collection component of every score.
    coltermscount = 0.0
    for termcounts in index.values():
        coltermscount += sum(termcounts.values())

    rankedList = [LM(q, d) for d in index]
    rankedList.sort(key=operator.itemgetter(1), reverse=True)

    # Single-argument parenthesised print produces the same output under
    # the Python 2 print statement and the print() function.
    print("Result:")
    for item in rankedList:
        print(item)

def LM(q, d, alpha=0.5):
    """Score document *d* for query *q* with an interpolated unigram model.

    For every query term the document estimate ``tf / document length`` is
    mixed with the collection estimate ``colcount(t) / coltermscount``
    using weight ``alpha`` for the collection part; the per-term values are
    multiplied together.

    Args:
        q: the raw query string; tokenised with getwords().
        d: a key of the module-level ``index``.
        alpha: weight given to the collection estimate (default 0.5).

    Returns:
        A ``(d, score)`` tuple.

    Raises:
        KeyError: if *d* is not present in ``index``.
    """
    termcounts = index[d]
    doctermscount = float(sum(termcounts.values()))

    Pq = 1.0
    for t in getwords(q):
        # A term missing from the document simply has frequency zero.
        tf = termcounts.get(t, 0.0)

        # A document without any words, or a collection length that is
        # still zero, contributes no evidence; use 0 for that component
        # instead of raising ZeroDivisionError.
        if doctermscount:
            docpart = (1 - alpha) * tf / doctermscount
        else:
            docpart = 0.0
        if coltermscount:
            colpart = alpha * colcount(t) / coltermscount
        else:
            colpart = 0.0

        Pq *= docpart + colpart
    return (d, Pq)

def colcount(t):
    """Return how often term *t* occurs across all documents in ``index``.

    Documents that do not contain *t* contribute zero.

    Args:
        t: the term to look up.

    Returns:
        The summed occurrence count (0 when no document contains *t*).
    """
    # dict.get replaces the former bare ``except:``, which also swallowed
    # unrelated errors such as KeyboardInterrupt.
    return sum(termcounts.get(t, 0) for termcounts in index.values())

def crawl(urls):
    """Index each page in *urls*, then prompt for one query and rank for it.

    Pages are fetched with readpage() and stored in the module-level
    ``index`` under their URL. Afterwards a query is read from standard
    input and passed to search().

    Args:
        urls: iterable of page URLs.
    """
    for url in urls:
        # ``index.setdefault(url, readpage(url))`` evaluates readpage()
        # before the lookup, so a URL that is already indexed would be
        # downloaded and parsed again only to have the result thrown away.
        if url in index:
            continue
        print("Indexing %s" % (url))
        index[url] = readpage(url)

    q = raw_input("Enter Search Query: ")
    search(q)

def readpage(page):
    """Download *page* and return its word-frequency table.

    The response body is parsed with BeautifulSoup, reduced to text with
    gettextonly(), tokenised with getwords() and counted with
    getwordcount().

    Args:
        page: URL handed to ``urllib2.urlopen``.

    Returns:
        dict mapping each word to its number of occurrences on the page.
    """
    c = urllib2.urlopen(page)
    try:
        html = c.read()
    finally:
        # Always release the connection, even when the read fails.
        c.close()

    soup = BeautifulSoup(html)
    text = gettextonly(soup)
    words = getwords(text)
    return getwordcount(words)
    
def gettextonly(soup):
    """Recursively collect the text held by *soup* and its children.

    Args:
        soup: an object exposing ``.string`` and ``.contents``.

    Returns:
        ``soup.string`` stripped of surrounding whitespace when it is set;
        otherwise the text of every entry in ``soup.contents``, each
        followed by a newline, concatenated in order.
    """
    v = soup.string
    # Identity test: None is a singleton, and ``==`` would dispatch to the
    # node's own comparison method.
    if v is not None:
        return v.strip()

    # Join once instead of growing a string with ``+=`` per child.
    return ''.join(gettextonly(t) + "\n" for t in soup.contents)
def getwords(text):
    """Split *text* into lower-cased words.

    Words are the maximal runs of word characters (letters, digits and
    underscore); everything else acts as a separator.

    Args:
        text: the string to tokenise.

    Returns:
        list of lower-cased words in order of appearance; empty pieces are
        dropped.
    """
    # '\W+' rather than '\W*': a pattern that can match the empty string
    # makes re.split() cut between every character on Python 3.7+, while
    # older versions ignored empty matches. '\W+' gives the intended split
    # on every version.
    splitter = re.compile(r'\W+')
    return [s.lower() for s in splitter.split(text) if s]

def getwordcount(words):
    """Tally how many times each entry of *words* occurs.

    Args:
        words: iterable of hashable items (typically word strings).

    Returns:
        dict mapping each distinct item to its number of occurrences.
    """
    tally = {}
    for word in words:
        tally[word] = tally.get(word, 0) + 1
    return tally

