To keep the number of photos in my photo library sane, I had to sift through many pictures and get rid of redundant ones. I defined redundancy as several pictures taken at the same time. For each such group I had to pick one picture and delete the others.
My strategy so far was to use Nautilus and Eye of GNOME to spot pictures of the same group and delete all but the best one.
I realised that photos usually show the same scene if they were shot at the same time, i.e. many quick shots one after another. I also realised that the best photograph was usually the biggest one in terms of bytes in JPEG format.
To automate the whole selection and deletion process, I hacked together a tiny script that stupidly groups files in a directory according to their mtime and deletes all but the biggest one.
Before deleting anything, it shows the pictures of a group with eog (Eye of GNOME)
and asks whether or not to delete the other pictures.
It worked quite well and helped to quickly weed out 15% of my pictures 🙂
I played around with another method: Getting the difference of the histograms of the images, to compare the similarity. But as the pictures were shot with a different exposure, the histograms were quite different, too. Hence that didn’t work out very well. But I’ll leave it in, just for reference.
So if you happen to have a similar problem, feel free to grab the following script 🙂
#!/usr/bin/env python
import collections
import math
import os
from os.path import join, getsize, getmtime
import operator
import subprocess
import sys
def _popen_enter(self):
    """Bind the process object itself in ``with subprocess.Popen(...) as p``."""
    return self


def _popen_exit(self, exc_type, exc_value, exc_traceback):
    """Stop the child process when the ``with`` block is left and reap it.

    Returns None, so an exception raised inside the ``with`` body propagates.
    """
    if self.poll() is None:
        # Still running: the child is only needed while the block is active.
        self.kill()
    # Collect the exit status so the child does not linger as a zombie.
    self.wait()


# Popen only became a context manager in Python 3.2.  These hooks make the
# ``with`` statement work on older interpreters and make leaving the block
# kill the child instead of waiting for it to finish on its own.
subprocess.Popen.__enter__ = _popen_enter
subprocess.Popen.__exit__ = _popen_exit

# Root of the directory tree that is scanned for pictures.
directory = '.'
# Minimum number of files sharing one mtime to count as a redundant group.
THRESHOLD = 3
# When true, also print the RMS histogram difference of neighbouring pictures.
GET_RMS = False
# Maps a modification time to the list of file paths carrying that mtime.
mtimes = collections.defaultdict(list)
def get_picgroups_by_time(directory='.', threshold=None):
    """Group the files below *directory* by identical modification time.

    The tree is walked recursively with ``os.walk``.  Files whose
    ``getmtime`` values are exactly equal end up in the same group.

    Args:
        directory: Root of the tree to scan.
        threshold: Minimum number of files a group needs to be reported.
            ``None`` (the default) uses the module-level ``THRESHOLD``.

    Returns:
        A list of groups (lists of file paths), ordered by ascending mtime.
    """
    if threshold is None:
        threshold = THRESHOLD
    # A fresh mapping per call: repeated calls must not accumulate paths
    # from earlier scans.
    by_mtime = collections.defaultdict(list)
    for root, _dirs, files in os.walk(directory):
        for name in files:
            fname = join(root, name)
            by_mtime[getmtime(fname)].append(fname)
    # Iterating the sorted keys yields the groups in chronological order
    # without needing an ordered dictionary.
    return [by_mtime[mtime] for mtime in sorted(by_mtime)
            if len(by_mtime[mtime]) >= threshold]
def get_picgroups(directory='.'):
    """Return the groups of presumably redundant pictures below *directory*.

    Grouping is delegated to ``get_picgroups_by_time``.

    Args:
        directory: Root of the tree to scan.

    Returns:
        Whatever ``get_picgroups_by_time`` returns for *directory*.
    """
    # Forward the argument; calling the helper without it would always
    # scan the current working directory.
    return get_picgroups_by_time(directory)
# Collect the candidate groups once, up front, and report how many were found.
picgroups = get_picgroups(directory)
# Parenthesised single-argument form: prints the same text on Python 2 and 3.
print('Got %d groups' % (len(picgroups),))
def get_max_and_picgroups(picgroups, get_rms=None):
    """Yield ``(largest_file, group)`` for every group in *picgroups*.

    For each group the file with the greatest size in bytes is selected and
    the group plus the selection are printed.  Optionally the RMS difference
    between the histograms of neighbouring pictures is printed as well.

    Args:
        picgroups: Iterable of lists of file paths.
        get_rms: Whether to print the histogram RMS of neighbouring
            pictures.  ``None`` (the default) uses the module-level
            ``GET_RMS``.

    Yields:
        Tuples ``(max_of_group, picgroup)``.
    """
    if get_rms is None:
        get_rms = GET_RMS
    for picgroup in picgroups:
        max_of_group = max(picgroup, key=getsize)
        print(picgroup)
        print('max: %s: %d' % (max_of_group, getsize(max_of_group)))
        if get_rms:
            # Imported lazily so PIL is only required when this is enabled.
            import PIL.Image
            previous = None
            for pic in picgroup:
                # Each picture is decoded once; its histogram is carried
                # over to the comparison with the following picture.
                histogram = PIL.Image.open(pic).histogram()
                if previous is not None:
                    last_pic, last_histogram = previous
                    # zip stops at the shorter sequence should the two
                    # histograms differ in length.
                    squares = sum((a - b) ** 2
                                  for a, b in zip(last_histogram, histogram))
                    # float() avoids integer floor division on Python 2.
                    rms = math.sqrt(float(squares) / len(last_histogram))
                    print('RMS %s %s: %s' % (last_pic, pic, rms))
                previous = (pic, histogram)
        yield (max_of_group, picgroup)
# get_max_and_picgroups is a generator function, so nothing is inspected or
# printed here; the work happens when max_and_picgroups is iterated.
max_and_picgroups = get_max_and_picgroups(picgroups)
def decide(prompt, decisions):
    """Print *prompt* and wait for one key press that selects an action.

    Standard input is switched to non-canonical, no-echo, non-blocking mode
    so a single key is delivered without Enter.  Keys that are not in
    *decisions* are reported and ignored.  The previous terminal attributes
    and file status flags are restored before returning.

    Args:
        prompt: Text printed before waiting for input.
        decisions: Mapping from a one-character string to a zero-argument
            callable.  The callable of the first matching key is invoked;
            its return value is ignored.

    Returns:
        None.

    Raises:
        EOFError: If standard input yields an empty read before a decision
            was made.
    """
    import fcntl
    import os
    import select
    import sys
    import termios

    fd = sys.stdin.fileno()
    oldterm = termios.tcgetattr(fd)
    oldflags = fcntl.fcntl(fd, fcntl.F_GETFL)
    newattr = oldterm[:]
    # Index 3 holds the local-mode flags: drop line buffering and echo.
    newattr[3] = newattr[3] & ~termios.ICANON & ~termios.ECHO
    try:
        # Both state changes happen inside the try so that a failure of the
        # second one still restores the first.
        termios.tcsetattr(fd, termios.TCSANOW, newattr)
        fcntl.fcntl(fd, fcntl.F_SETFL, oldflags | os.O_NONBLOCK)
        print(prompt)
        while True:
            readable, _writable, _errored = select.select([fd], [], [])
            if not readable:
                continue
            c = sys.stdin.read(1)
            if not c:
                # An empty read will never match a key; bail out instead
                # of spinning forever.
                raise EOFError('stdin closed while waiting for a decision')
            print('Got character %s' % repr(c))
            action = decisions.get(c)
            if action:
                action()
                return
    finally:
        termios.tcsetattr(fd, termios.TCSAFLUSH, oldterm)
        fcntl.fcntl(fd, fcntl.F_SETFL, oldflags)
# Show every group in the image viewer and ask whether all pictures except
# the largest one should be deleted.
for max_of_group, picgroup in max_and_picgroups:
    cmd = ['eog', '-n'] + picgroup
    print('Showing %s' % ', '.join(picgroup))

    # The defaults bind the current loop values to this particular closure.
    def delete_others(keep=max_of_group, group=picgroup):
        """Unlink every file of the group except the one to keep."""
        to_delete = list(group)
        to_delete.remove(keep)
        print('deleting %s' % ', '.join(to_delete))
        for path in to_delete:
            os.unlink(path)

    # The viewer process lives only as long as the question is open.
    with subprocess.Popen(cmd) as p:
        decide('%s is max, delete others?' % max_of_group,
               {'y': delete_others, 'n': lambda: ''})
#!/usr/bin/env python
import collections
import math
import os
from os.path import join, getsize, getmtime
import operator
import subprocess
import sys
def _popen_enter(self):
    """Bind the process object itself in ``with subprocess.Popen(...) as p``."""
    return self


def _popen_exit(self, exc_type, exc_value, exc_traceback):
    """Stop the child process when the ``with`` block is left and reap it.

    Returns None, so an exception raised inside the ``with`` body propagates.
    """
    if self.poll() is None:
        # Still running: the child is only needed while the block is active.
        self.kill()
    # Collect the exit status so the child does not linger as a zombie.
    self.wait()


# Popen only became a context manager in Python 3.2.  These hooks make the
# ``with`` statement work on older interpreters and make leaving the block
# kill the child instead of waiting for it to finish on its own.
subprocess.Popen.__enter__ = _popen_enter
subprocess.Popen.__exit__ = _popen_exit

# Root of the directory tree that is scanned for pictures.
directory = '.'
# Minimum number of files sharing one mtime to count as a redundant group.
THRESHOLD = 3
# When true, also print the RMS histogram difference of neighbouring pictures.
GET_RMS = False
# Maps a modification time to the list of file paths carrying that mtime.
mtimes = collections.defaultdict(list)
def get_picgroups_by_time(directory='.', threshold=None):
    """Group the files below *directory* by identical modification time.

    The tree is walked recursively with ``os.walk``.  Files whose
    ``getmtime`` values are exactly equal end up in the same group.

    Args:
        directory: Root of the tree to scan.
        threshold: Minimum number of files a group needs to be reported.
            ``None`` (the default) uses the module-level ``THRESHOLD``.

    Returns:
        A list of groups (lists of file paths), ordered by ascending mtime.
    """
    if threshold is None:
        threshold = THRESHOLD
    # A fresh mapping per call: repeated calls must not accumulate paths
    # from earlier scans.
    by_mtime = collections.defaultdict(list)
    for root, _dirs, files in os.walk(directory):
        for name in files:
            fname = join(root, name)
            by_mtime[getmtime(fname)].append(fname)
    # Iterating the sorted keys yields the groups in chronological order
    # without needing an ordered dictionary.
    return [by_mtime[mtime] for mtime in sorted(by_mtime)
            if len(by_mtime[mtime]) >= threshold]
def get_picgroups(directory='.'):
    """Return the groups of presumably redundant pictures below *directory*.

    Grouping is delegated to ``get_picgroups_by_time``.

    Args:
        directory: Root of the tree to scan.

    Returns:
        Whatever ``get_picgroups_by_time`` returns for *directory*.
    """
    # Forward the argument; calling the helper without it would always
    # scan the current working directory.
    return get_picgroups_by_time(directory)
# Collect the candidate groups once, up front, and report how many were found.
picgroups = get_picgroups(directory)
# Parenthesised single-argument form: prints the same text on Python 2 and 3.
print('Got %d groups' % (len(picgroups),))
def get_max_and_picgroups(picgroups, get_rms=None):
    """Yield ``(largest_file, group)`` for every group in *picgroups*.

    For each group the file with the greatest size in bytes is selected and
    the group plus the selection are printed.  Optionally the RMS difference
    between the histograms of neighbouring pictures is printed as well.

    Args:
        picgroups: Iterable of lists of file paths.
        get_rms: Whether to print the histogram RMS of neighbouring
            pictures.  ``None`` (the default) uses the module-level
            ``GET_RMS``.

    Yields:
        Tuples ``(max_of_group, picgroup)``.
    """
    if get_rms is None:
        get_rms = GET_RMS
    for picgroup in picgroups:
        max_of_group = max(picgroup, key=getsize)
        print(picgroup)
        print('max: %s: %d' % (max_of_group, getsize(max_of_group)))
        if get_rms:
            # Imported lazily so PIL is only required when this is enabled.
            import PIL.Image
            previous = None
            for pic in picgroup:
                # Each picture is decoded once; its histogram is carried
                # over to the comparison with the following picture.
                histogram = PIL.Image.open(pic).histogram()
                if previous is not None:
                    last_pic, last_histogram = previous
                    # zip stops at the shorter sequence should the two
                    # histograms differ in length.
                    squares = sum((a - b) ** 2
                                  for a, b in zip(last_histogram, histogram))
                    # float() avoids integer floor division on Python 2.
                    rms = math.sqrt(float(squares) / len(last_histogram))
                    print('RMS %s %s: %s' % (last_pic, pic, rms))
                previous = (pic, histogram)
        yield (max_of_group, picgroup)
# get_max_and_picgroups is a generator function, so nothing is inspected or
# printed here; the work happens when max_and_picgroups is iterated.
max_and_picgroups = get_max_and_picgroups(picgroups)
def decide(prompt, decisions):
    """Print *prompt* and wait for one key press that selects an action.

    Standard input is switched to non-canonical, no-echo, non-blocking mode
    so a single key is delivered without Enter.  Keys that are not in
    *decisions* are reported and ignored.  The previous terminal attributes
    and file status flags are restored before returning.

    Args:
        prompt: Text printed before waiting for input.
        decisions: Mapping from a one-character string to a zero-argument
            callable.  The callable of the first matching key is invoked;
            its return value is ignored.

    Returns:
        None.

    Raises:
        EOFError: If standard input yields an empty read before a decision
            was made.
    """
    import fcntl
    import os
    import select
    import sys
    import termios

    fd = sys.stdin.fileno()
    oldterm = termios.tcgetattr(fd)
    oldflags = fcntl.fcntl(fd, fcntl.F_GETFL)
    newattr = oldterm[:]
    # Index 3 holds the local-mode flags: drop line buffering and echo.
    newattr[3] = newattr[3] & ~termios.ICANON & ~termios.ECHO
    try:
        # Both state changes happen inside the try so that a failure of the
        # second one still restores the first.
        termios.tcsetattr(fd, termios.TCSANOW, newattr)
        fcntl.fcntl(fd, fcntl.F_SETFL, oldflags | os.O_NONBLOCK)
        print(prompt)
        while True:
            readable, _writable, _errored = select.select([fd], [], [])
            if not readable:
                continue
            c = sys.stdin.read(1)
            if not c:
                # An empty read will never match a key; bail out instead
                # of spinning forever.
                raise EOFError('stdin closed while waiting for a decision')
            print('Got character %s' % repr(c))
            action = decisions.get(c)
            if action:
                action()
                return
    finally:
        termios.tcsetattr(fd, termios.TCSAFLUSH, oldterm)
        fcntl.fcntl(fd, fcntl.F_SETFL, oldflags)
# Show every group in the image viewer and ask whether all pictures except
# the largest one should be deleted.
for max_of_group, picgroup in max_and_picgroups:
    cmd = ['eog', '-n'] + picgroup
    print('Showing %s' % ', '.join(picgroup))

    # The defaults bind the current loop values to this particular closure.
    def delete_others(keep=max_of_group, group=picgroup):
        """Unlink every file of the group except the one to keep."""
        to_delete = list(group)
        to_delete.remove(keep)
        print('deleting %s' % ', '.join(to_delete))
        for path in to_delete:
            os.unlink(path)

    # The viewer process lives only as long as the question is open.
    with subprocess.Popen(cmd) as p:
        decide('%s is max, delete others?' % max_of_group,
               {'y': delete_others, 'n': lambda: ''})