hpaste

recent | annotate | new

> import Data.Function
> import Data.List
> import Data.Ord
> import System.IO
> import Text.Printf

from __future__ import with_statement
from __future__ import division
import pickle

class Review(object):
    """A single rating: which movie, what grade, and on which date."""

    def __init__(self, movieId, grade, date):
        # Plain value holder; the arguments are stored unmodified.
        self.movieId, self.grade, self.date = movieId, grade, date

> -- | One rating of a movie: the movie's ID, the grade given, and the
> -- date kept as a plain string.
> data Review = Review
>   { movieID :: Int
>   , grade   :: Int
>   , date    :: String
>   } deriving (Eq, Ord, Show)

class Customer(object):
    """A customer id together with the list of reviews for that customer."""

    def __init__(self, idnumber, review):
        # Wrap the first review in a list so further ones can be added later.
        self.reviews = [review]
        self.idnumber = idnumber

> -- | A customer ID together with every review recorded for that customer.
> data Customer = Customer
>   { idnumber :: Int
>   , reviews  :: [Review]
>   } deriving (Eq, Ord, Show)

    def addReview(self, review):
        self.reviews += review

    def averageGrade(self):
        tot = 0
        for review in self.reviews:
            tot += review.grade
        return tot / len(self.reviews)

> -- | Mean grade over all of a customer's reviews.
> averageGrade :: Customer -> Double
> averageGrade customer = fromIntegral total / fromIntegral count
>   where
>     grades = map grade (reviews customer)
>     total  = sum grades
>     count  = length grades

def fname(filmnbr):
    """Return the path of the review file for movie number `filmnbr`.

    The number is left-padded with '0' to at least five characters and
    placed between the fixed prefix and the '.txt' suffix.
    """
    digits = str(filmnbr)
    return '/tmp/netflix/mv_00' + digits.rjust(5, '0') + '.txt'

> -- | Path of the review file for a movie number; the number is
> -- zero-padded to seven digits.
> fname :: Int -> FilePath
> fname movieNumber = printf "/tmp/netflix/mv_%07d.txt" movieNumber

def customerInList(customer, customers):
    """Merge `customer` into `customers` and return the resulting list.

    If an entry with the same idnumber already exists, the reviews of
    `customer` are added to that entry (in place) and `customers` itself
    is returned.  Otherwise a new list with `customer` appended is
    returned.  `customers` may be None, which is treated as an empty list.
    """
    if customers is None:
        # Previously None skipped the search but then hit
        # `None + [customer]`, raising TypeError.
        customers = []
    for el in customers:
        if customer.idnumber == el.idnumber:
            el.addReview(customer.reviews)
            return customers
    return customers + [customer]

def parseFile(filename, customers):
    """Read one movie file and fold its reviews into `customers`.

    `filename` is the movie number handed to fname().  The first line of
    the file supplies the movie id (the text before the first ':'); each
    remaining line is split on ',' into customer id, grade and date.
    Returns the updated customer list.
    """
    with open(fname(filename)) as moviereviews:
        rows = iter(moviereviews)
        header = rows.next()
        movieId = header.split(":")[0]
        for row in rows:
            fields = row.split(",")
            entry = Review(movieId, int(fields[1]), fields[2].strip("\n"))
            customers = customerInList(Customer(fields[0], entry), customers)
    return customers

> -- | Break a list into the pieces separated by a delimiter element.
> -- Delimiters are dropped; a list containing no delimiter (including the
> -- empty list) yields exactly one piece.
> split :: Eq a => a -> [a] -> [[a]]
> split ch s =
>   let (piece, rest) = break (== ch) s
>   in case rest of
>        []         -> [piece]
>        (_ : more) -> piece : split ch more

> -- | Parse the contents of one movie file into per-customer review lists.
> --
> -- The first line is the header, whose text before @':'@ is the movie ID;
> -- every following line has the form @customerID,grade,date@.  The result
> -- holds one 'Customer' per distinct ID, in ascending numeric ID order.
> parse :: String -> [Customer]
> parse str =
>   [ Customer (fst (head crs)) (map snd crs)   -- groupBy never yields an empty group
>   | crs <- groupBy ((==) `on` fst) . sortBy (comparing fst) $ entries
>   ]
>   where
>     (header:body) = lines str
>     movID = read . takeWhile (/= ':') $ header
>     -- Convert the customer ID to Int *before* sorting and grouping.
>     -- Ordering the raw strings would put "10" before "9", which breaks
>     -- the numeric ordering that 'merge' depends on when combining files.
>     entries :: [(Int, Review)]
>     entries =
>       [ (read cid, Review { movieID = movID, grade = read g, date = d })
>       | line <- body
>       , let [cid, g, d] = split ',' line
>       ]

> -- | Read the review file for the given movie number and parse it.
> parseFile :: Int -> IO [Customer]
> parseFile movieNumber = do
>   contents <- readFile (fname movieNumber)
>   return (parse contents)

def parseAll(start, stop):
    """Parse movie files numbered `start` through `stop` (inclusive).

    Each file is folded into the customer list accumulated so far; the
    final list is returned.
    """
    collected = []
    movie_numbers = range(start, stop + 1)
    for number in movie_numbers:
        collected = parseFile(number, collected)
    return collected

> -- | Combine two customer lists in the manner of a sorted-list merge;
> -- both inputs are expected to be in ascending 'idnumber' order.
> -- Entries with equal IDs at the heads are fused into one customer whose
> -- reviews are the left entry's followed by the right entry's.
> merge :: [Customer] -> [Customer] -> [Customer]
> merge left [] = left
> merge [] right = right
> merge left@(l:ls) right@(r:rs) =
>   case compare (idnumber l) (idnumber r) of
>     EQ -> Customer { idnumber = idnumber l, reviews = reviews l ++ reviews r } : merge ls rs
>     LT -> l : merge ls right
>     GT -> r : merge left rs

> -- | Parse the movie files numbered @start@ through @stop@ (inclusive)
> -- and merge them into a single customer list.  An empty range
> -- (@start > stop@) yields the empty list.
> parseAll :: Int -> Int -> IO [Customer]
> parseAll start stop = do
>   files <- mapM parseFile [start..stop]
>   -- 'foldr' with an empty seed rather than 'foldr1': the latter throws
>   -- on an empty list of files.  Because @merge x [] = x@, the result
>   -- for a non-empty range is unchanged.
>   return $ foldr merge [] files

parseAll(1, 5)  # script entry: parse movie files 1 through 5; the returned list is discarded

 # rwbarton@functor:/tmp$ grep -v '^>' netflix.lhs | time python
 # 1.97user 0.01system 0:01.98elapsed 100%CPU (0avgtext+0avgdata 0maxresident)k
 # 0inputs+0outputs (0major+1262minor)pagefaults 0swaps


> -- | Parse movie files 1 through 5 and print the resulting customer list.
> main :: IO ()
> main = parseAll 1 5 >>= print

> -- rwbarton@functor:/tmp$ ghc -O2 --make netflix-parse
> -- rwbarton@functor:/tmp$ /usr/bin/time ./netflix-parse > /dev/null
> -- 0.13user 0.00system 0:00.13elapsed 100%CPU (0avgtext+0avgdata 0maxresident)k
> -- 0inputs+0outputs (0major+1437minor)pagefaults 0swaps

> -- Without any cleverness (note: no use of Data.List.sort or Ord)

> import Data.Function
> import Data.Ord
> import System.IO
> import Text.Printf

from __future__ import with_statement
from __future__ import division
import pickle

class Review(object):
    """A single rating: which movie, what grade, and on which date."""

    def __init__(self, movieId, grade, date):
        # Plain value holder; the arguments are stored unmodified.
        self.movieId, self.grade, self.date = movieId, grade, date

> -- | One rating of a movie: the movie's ID, the grade given, and the
> -- date kept as a plain string.
> data Review = Review
>   { movieID :: Int
>   , grade   :: Int
>   , date    :: String
>   } deriving (Eq, Show)

class Customer(object):
    """A customer id together with the list of reviews for that customer."""

    def __init__(self, idnumber, review):
        # Wrap the first review in a list so further ones can be added later.
        self.reviews = [review]
        self.idnumber = idnumber

> -- | A customer ID together with every review recorded for that customer.
> data Customer = Customer
>   { idnumber :: Int
>   , reviews  :: [Review]
>   } deriving (Eq, Show)

    def addReview(self, review):
        self.reviews += review

def fname(filmnbr):
    """Return the path of the review file for movie number `filmnbr`.

    The number is left-padded with '0' to at least five characters and
    placed between the fixed prefix and the '.txt' suffix.
    """
    digits = str(filmnbr)
    return '/tmp/netflix/mv_00' + digits.rjust(5, '0') + '.txt'

> -- | Path of the review file for a movie number; the number is
> -- zero-padded to seven digits.
> fname :: Int -> FilePath
> fname movieNumber = printf "/tmp/netflix/mv_%07d.txt" movieNumber

def customerInList(customer, customers):
    """Merge `customer` into `customers` and return the resulting list.

    If an entry with the same idnumber already exists, the reviews of
    `customer` are added to that entry (in place) and `customers` itself
    is returned.  Otherwise a new list with `customer` appended is
    returned.  `customers` may be None, which is treated as an empty list.
    """
    if customers is None:
        # Previously None skipped the search but then hit
        # `None + [customer]`, raising TypeError.
        customers = []
    for el in customers:
        if customer.idnumber == el.idnumber:
            el.addReview(customer.reviews)
            return customers
    return customers + [customer]

> -- | Fold one customer into a list.  If an entry with the same
> -- 'idnumber' exists, the first such entry is replaced by one whose
> -- reviews are the new customer's followed by the existing ones;
> -- otherwise the customer is appended at the end.
> updateCustomer :: Customer -> [Customer] -> [Customer]
> updateCustomer x ws =
>   case ws of
>     [] -> [x]
>     (w:rest)
>       | idnumber x == idnumber w -> fuse w : rest
>       | otherwise                -> w : updateCustomer x rest
>   where
>     fuse w = Customer { idnumber = idnumber x, reviews = reviews x ++ reviews w }

def parseFile(filename, customers):
    """Read one movie file and fold its reviews into `customers`.

    `filename` is the movie number handed to fname().  The first line of
    the file supplies the movie id (the text before the first ':'); each
    remaining line is split on ',' into customer id, grade and date.
    Returns the updated customer list.
    """
    with open(fname(filename)) as moviereviews:
        rows = iter(moviereviews)
        header = rows.next()
        movieId = header.split(":")[0]
        for row in rows:
            fields = row.split(",")
            entry = Review(movieId, int(fields[1]), fields[2].strip("\n"))
            customers = customerInList(Customer(fields[0], entry), customers)
    return customers

> -- | Break a list into the pieces separated by a delimiter element.
> -- Delimiters are dropped; a list containing no delimiter (including the
> -- empty list) yields exactly one piece.
> split :: Eq a => a -> [a] -> [[a]]
> split ch s =
>   let (piece, rest) = break (== ch) s
>   in case rest of
>        []         -> [piece]
>        (_ : more) -> piece : split ch more

> -- | Parse the contents of one movie file.  The header line gives the
> -- movie ID (the text before @':'@); each remaining line of the form
> -- @customerID,grade,date@ becomes a single-review customer, and these
> -- are folded together with 'updateCustomer'.
> parse :: String -> [Customer]
> parse str = foldr updateCustomer [] singletons
>   where
>     (header:body) = lines str
>     movID = read (takeWhile (/= ':') header)
>     singletons = map toCustomer body
>     toCustomer line =
>       let [cid, g, d] = split ',' line
>       in Customer (read cid) [Review { movieID = movID, grade = read g, date = d }]

> -- | Read the review file for the given movie number and parse it.
> parseFile :: Int -> IO [Customer]
> parseFile movieNumber = do
>   contents <- readFile (fname movieNumber)
>   return (parse contents)

def parseAll(start, stop):
    """Parse movie files numbered `start` through `stop` (inclusive).

    Each file is folded into the customer list accumulated so far; the
    final list is returned.
    """
    collected = []
    movie_numbers = range(start, stop + 1)
    for number in movie_numbers:
        collected = parseFile(number, collected)
    return collected

> -- | Merge two customer lists by inserting each customer of the first
> -- list into the second with 'updateCustomer', starting from the last
> -- element of the first list.
> mergeQuadratic :: [Customer] -> [Customer] -> [Customer]
> mergeQuadratic xs y = foldr updateCustomer y xs

> -- | Parse the movie files numbered @start@ through @stop@ (inclusive)
> -- and combine them into a single customer list.  An empty range
> -- (@start > stop@) yields the empty list.
> parseAll :: Int -> Int -> IO [Customer]
> parseAll start stop = do
>   files <- mapM parseFile [start..stop]
>   -- 'foldr1' throws on an empty list, so handle that case explicitly.
>   -- Seeding a plain 'foldr' with [] is not used here because
>   -- 'mergeQuadratic' would then re-insert the last file's customers one
>   -- by one, changing their order in the result.
>   return $ case files of
>     [] -> []
>     _  -> foldr1 mergeQuadratic files

parseAll(1, 5)  # script entry: parse movie files 1 through 5; the returned list is discarded

 # rwbarton@functor:/tmp$ grep -v '^>' netflix.lhs | time python
 # 1.97user 0.01system 0:01.98elapsed 100%CPU (0avgtext+0avgdata 0maxresident)k
 # 0inputs+0outputs (0major+1262minor)pagefaults 0swaps


> -- | Parse movie files 1 through 5 and print the resulting customer list.
> main :: IO ()
> main = parseAll 1 5 >>= print

> -- rwbarton@functor:/tmp$ ghc -O2 --make netflix-parse
> -- rwbarton@functor:/tmp$ /usr/bin/time ./netflix-parse > /dev/null
> -- 0.37user 0.01system 0:00.38elapsed 100%CPU (0avgtext+0avgdata 0maxresident)k
> -- 0inputs+0outputs (0major+1657minor)pagefaults 0swaps