-
Notifications
You must be signed in to change notification settings - Fork 0
/
helpers_moodbar.py
465 lines (389 loc) · 19.6 KB
/
helpers_moodbar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
''' This was an experiment in moodbar, adding equal loudness and critical bands.
I may implement it in something more portable once I'm satisfied. I may be too lazy.
Mono 16-bit is hardcoded in places. Sample rate could be more easily changed.
Two-process and takes more than one core, but less than two fully because ffmpeg is held up by our calculations.
TODO:
- see if pydub makes everything better. Which is also using ffmpeg, but very likely more mature than my code.
- check for presence of ffprobe and ffmpeg before running
- see whether https://github.com/spezifisch/pymoodbar is better than this in most ways.
CONSIDER: use a power-of-two sample rate, for FFT speed, if ffmpeg allows it.
'''
import os
import array
import math
import numpy
import numpy.fft
import scipy
import scipy.ndimage
import helpers_ffmpeg
### dB(B) coorection (specifically that bcause meant for intermediate loudness. Sure it's not really for music, but easy)
_dbb_lut_db = [] # index is hz, elements are dB
_dbb_lut_f = [] # index is hz, elements are factor
def _dbb_genlut(uptohz=22050):
''' Returns dB adjustment according to dB(B) filter at a given (integer) frequency
(returned values max out at about -0.20dB) '''
_dbb_lut_db.append( 0 ) # 0 for 0Hz (DC)
_dbb_lut_f.append( 0 )
for hz in range(1,uptohz+1):
f2 = float(hz)**2
f3 = float(hz)**3
a = 148840000.0 # 12200.**2
b = 424.36 # 20.6**2
c = 25122.25 # 158.5**2
Rb= (a * f3 )/( (f2+b)*(f2+a)*math.sqrt(f2+c) )
db_adjust = 0.17 + 20.*math.log(Rb)
factor_adjust = 10.**(0.05*db_adjust)
_dbb_lut_db.append( db_adjust )
_dbb_lut_f.append( factor_adjust )
def dbb_db(hz):
' this function assumes the lookuptable has been generated; this module calls that at module scope (i.e. happens at import time) '
if len(_dbb_lut_db)==0:
_dbb_genlut()
hz=int(hz)
return _dbb_lut_db[hz]
def dbb_factor(hz):
' this function assumes the lookuptable has been generated; this module calls that at module scope (i.e. happens at import time) '
if len(_dbb_lut_f)==0:
_dbb_genlut()
hz=int(hz)
return _dbb_lut_f[hz]
### Frequency-to-bark, to accumulate per bark band
# The usual implementation is just a few explicit thresholds.
# This lookup table is lazy and overkill, but was handy while comparing critical band functions...
def bark_traunmuller(hz, mn=0, mx=23):
''' Critical band rate (Hz->Bark), according to Traunmuller 1990.
The result is truncated to 0..23 (inclusive) by default
in part because the formula itself returns <0 for <40Hz, '''
v = (26.81*float(hz))/(1960.+float(hz)) - 0.53
v = max(v, mn)
v = min(v, mx)
return int(v)
_bark_lut = []
def _bark_genlut(uptohz=22050):
for hz in range(uptohz+1):
_bark_lut.append( bark_traunmuller(hz) )
def bark(hz):
''' functionally the same as bark_traunmuller() but via a lookup table that precalculates the value for every whole-hz number. '''
if len(_bark_lut)==0:
_bark_genlut()
hz = int(hz)
return _bark_lut[hz]
#### Decoding and helpers ##########################
def covering_windows(ary, window_size, min_overlap):
''' Returns a bunch of windows of a given size, from a possibly-larger array. Made for time series, so 1D.
In moodbar context primarily to be able to work on 1/1000th at a time, but not FFT tha all at once
because that's terrible when a track is on the order of an hour (that'd be ~200ksamples).
Instead, we'd want to take much smaller windows however many it takes to cover that
and average their results.
In addition to using all all samples, while guaranteeing a minimum overlap (e.g. for FFT windowing reasons)
In other words, find the smallest amount of steps that has at least that amount of overlap.
min_overlap > 1 is interpreted as amount of samples,
min_overlap in 0..1 is interpreted as a fraction of window_size
Note that for FFT and window-function reasons, more than ~0.3 is overly expensive
and more than 0.5 doesn't make sense
Note that since this assumes all data must be looked at,
we are biased to sometimes choose a rather higher overlap than requested,
particularly if the window_size isn't much smaller than the input size
e.g. 4096 with window_size 512 and min_overlap 0 is 8 steps of size 512 (0 overlap)
4096 with window_size 512 and min_overlap 1 is 9 steps of size 448 (64.0 overlap)
4096 with window_size 512 and min_overlap 128 is 11 steps of size 358.4 (153.6 overlap)
'''
w, = ary.shape
if w < window_size: # TODO: think about this more.
yield ary
else:
fmi = float(min_overlap)
if fmi>0.0 and fmi<1.0:
min_overlap = int(math.ceil(fmi*window_size))
#print "min_overlap from float (%.3f * %d) means %d "%(fmi, window_size, min_overlap)
# brute force this, I'm too lazy to think about it right now - though I expect it's just a sub and div. Ahem :)
wsteps = None
for numsteps in range( int( max(2,w / window_size)), int(w/2) ): # sane limits
st = numpy.linspace( window_size, w, numsteps )
stepsize = st[1]-st[0]
overlap = window_size-stepsize
if overlap >= min_overlap:
#print "Dividing width %s into %d steps (stepsize is %.1f) gives %.1f overlap"%(w,numsteps, stepsize,overlap)
wsteps = numsteps
break
for xoff in numpy.linspace(0, w-window_size, wsteps):
fromx = int(xoff)
tox = int(math.floor(fromx+window_size))
#print 'x',xoff, ' ', fromx, '..', tox, tox-fromx
yield ary[fromx:tox]
class DecodeError(Exception):
' custom exception, primarily for readability'
pass
###
_hcache = {}
def hanning(size):
' cache because 99%+ of these will be the same, and occasionally other sizes '
if size in _hcache:
return _hcache[size]
else:
ret = numpy.hanning(size)
_hcache[size]=ret
return ret
def make_mood(mediafilename): # , debug=False
'''Given a media filename (probably mp3, ogg, or such)
What it does:
- get length (using ffprobe) (to be able to do the second point and third points streaming-style)
- split into 1000 equal-sized chunks
- stream-decode file (using ffmpeg) (should take ~1sec of CPU per 4MB of MP4)
- FFT each chunk
- reweigh buckets for human perception:
- sum into Bark buckets (so non-linear)
- db(B) to adjust bucket amplitudes for perceptive
Returns:
- a 1000x24 uint8 numpy matrix (bark bands)
- a 1000x3 uint8 numpy matrix (moodbar RGBs)
(You could save these to binary data with ary.tostring())
...or None,None if it decides it can't.
TODO:
- optimize, once I've played with and settled on all the weighing
- deal better with few-second files
'''
sample_rate = 22050
# TODO: figure out cost/benefit against FFT speed (resampling to 22050 adds maybe 30% on top of decode calculations.)
# TODO: see if ffmpeg can output arbitrary sample rates
estlength_sec = helpers_ffmpeg.get_length(mediafilename)#, decode=False)
nsamples = estlength_sec * sample_rate
# Note that since our use is just Bark bands, we can get away with 1024 or 2048
#fudge things for very short media
if nsamples < 128000: # ..5sec
return None,None
#overlapsize = 8
#fftsize = 64
elif nsamples < 256000: # ..11sec
overlapsize = 16
fftsize = 128
elif nsamples < 512000: # ..23 secs
overlapsize = 32
fftsize = 256
elif nsamples < 1024000: # ..46 secs
overlapsize = 32
fftsize = 512
else: # most cases
overlapsize = 64
fftsize = 1024
bucketsize = int( 1+fftsize/2 )
chunklen_samples = int( nsamples / 1000. )
# note: int will truncate so this will add up time-error-wise. right now I choose not to care, but
# TODO: rewrite the logic to care.
#print( 'Length estimate (ffprobe): ~%.3f seconds, ~%d samples'%( estlength_sec, nsamples) )
#print( "Which would be ~%d samples @ %dHz"%(estlength_sec * sample_rate, sample_rate) )
#print( "Each 1/1000-length chunk is ~%d samples"%(chunklen_samples) )
# our first goal is to sum into bark-bands per 1000th-length segment
bark_ary = numpy.zeros( (24,1000), dtype=numpy.float32 )
try:
chunksample_gen = helpers_ffmpeg.stream_audio(mediafilename, sample_rate, chunk_samples=chunklen_samples) # generator
samplepos = 0 # keep track of how much data we saw
for i, chunksamples in enumerate(chunksample_gen):
if i==1000: # TODO: remove the need for this? (it's because we under-read a little, see chunklen_samples)
break
if len(chunksamples)==0: # TODO: remove the need for this?
break
#print( "chunk %s gets samples %d..%d (of %d)"%(i, samplepos, samplepos+len(chunksamples), nsamples,) ) # for debug of that time error
samplepos += chunksamples.shape[0]
#at_seconds = float(samplepos) / sample_rate
#sys.stdout.write( "at %dm%02ds (of ~%dm%02ds) in %r\n"%( at_seconds/60, at_seconds%60, estlength_sec/60, estlength_sec%60, os.path.basename(mediafilename) ))
#sys.stdout.flush()
# constants given the above
approx_width_hz = float( sample_rate/2. )/float( fftsize/2 )
barkbuckets = numpy.zeros(bucketsize, dtype=numpy.uint8)
factoradjusts = numpy.zeros(bucketsize, dtype=numpy.float32)
for fi in range(bucketsize):
approx_center = (0.5+fi)*approx_width_hz
barkbuckets[fi] = bark(approx_center)
factoradjusts[fi] = dbb_factor( approx_center )
for windowsamples in covering_windows(chunksamples, fftsize, overlapsize):
#if len(windowsamples)==0: # TODO: remove the need. This shouldn't happen?
# raise ValueError('bug in code, covering_windows outputted something %d long'%len(windowsamples))
# TODO: pad if smaller because we baked in size assumptions - length of the last chunk will typically be shorter
windowsamples *= hanning( len(windowsamples) )
#print( "chunk %d window %d: %d samples"%(i,wi, len(windowsamples)) )
ft = numpy.fft.rfft( windowsamples )
amp = numpy.abs( ft ) # amplitude spectrum
#pwr = amp**2 # PSD gives more contrasty spectrograms, but I like there being more to color.
amp /= fftsize # normalize
# reminder: bark_ary is a 24-by-1000-sized thing,
# amp is 1+fftsize/2 sized to be mapped into 24 via barkbuckets, and summed into bark_ary
#barkbuckets looks something like 0 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 3 4 4 4 4 4 5 5 5 5 5 6 6 6 6 6 6 6
# 7 7 7 7 7 7 8 8 8 8 8 8 8 8 9 9 9 9 9 9 9 9 9 etc
# so this, while not the most efficient, is understandable and brief
values, counts = numpy.unique(barkbuckets, return_counts=True)
for bbuck,cnt in zip( list(values), list(counts) ):
try:
bark_ary[bbuck][i] = amp[barkbuckets==bbuck].sum()/cnt
except Exception as e:
pass
for dump in chunksample_gen: # TODO: remove the need
pass
except DecodeError: # ...this no longer looks right, TODO: look
at_seconds = float(samplepos) / sample_rate
#print( "Decode error at %dm%02ds (of %dm%02ds)"%(at_seconds/60, at_seconds%60, estlength_sec/60, estlength_sec%60) )
diffsec = abs(at_seconds-estlength_sec)
if diffsec > 3:
print( "Decode before end, difference is %.2f seconds"%diffsec)
raise
else:
print( "Decode error at end (%.1f sec difference to estimated length) - small difference, probably something like stray APEv2, fine."%diffsec )
# CONSIDER: a fixed-dB range instead of either of these.
#import pandas, matplotlib.pyplot as plt
#print( pandas.DataFrame(data=mood_ary).T.describe())
#plt.imshow(bark_ary, cmap='gray', vmin=0, vmax=255)
#plt.show()
### Generating a more classic RGB moodbar just means summing 24 bark_ary rows into 3 somehow.
# there's a shorter way of doing this, which I'll do when I'm done tweaking
colorweight = (5.0, 5.0, 4.0, 3.0, # lo (red)
2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0, # mid (blue)
1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,3.0,3.0,5.0) # hi (green)
mood_ary = numpy.zeros( (3,1000), dtype=numpy.float32 )
mood_ary[0] += bark_ary[0]*colorweight[0] # ~50Hz
mood_ary[0] += bark_ary[1]*colorweight[1] # 150
mood_ary[1] += bark_ary[2]*colorweight[2] # 250 #bassy ends hereish
mood_ary[1] += bark_ary[3]*colorweight[3] # 350
mood_ary[1] += bark_ary[4]*colorweight[4] # 450
mood_ary[1] += bark_ary[5]*colorweight[5] # 550
mood_ary[1] += bark_ary[6]*colorweight[6] # 650
mood_ary[1] += bark_ary[7]*colorweight[7] # 700
mood_ary[1] += bark_ary[8]*colorweight[8] # 850
mood_ary[1] += bark_ary[9]*colorweight[9] # 1000
mood_ary[2] += bark_ary[10]*colorweight[10] # 1150
mood_ary[2] += bark_ary[11]*colorweight[11] # 1350
mood_ary[2] += bark_ary[12]*colorweight[12] # 1600
mood_ary[2] += bark_ary[13]*colorweight[13] # 2100
mood_ary[2] += bark_ary[14]*colorweight[14] # 2500
mood_ary[2] += bark_ary[15]*colorweight[15] # 2900
mood_ary[2] += bark_ary[16]*colorweight[16] # 3400
mood_ary[2] += bark_ary[17]*colorweight[17] # 4000
mood_ary[2] += bark_ary[18]*colorweight[18] # 4800
mood_ary[2] += bark_ary[19]*colorweight[19] # 5800
mood_ary[2] += bark_ary[20]*colorweight[20] # 7000
mood_ary[2] += bark_ary[21]*colorweight[21] # 8500
mood_ary[2] += bark_ary[22]*colorweight[22] # 10500
mood_ary[2] += bark_ary[23]*colorweight[23] # 13500
mood_ary[0] *= 0.12
mood_ary[1] *= 0.18
mood_ary[2] *= 0.90
#bark_ary = scipy.ndimage.maximum_filter(bark_ary, size=(1,4))
bark_ary = scipy.ndimage.percentile_filter(bark_ary, 70, size=(5,2)) # make it seem more blocky, but still retain some spectrogram
#mood_ary = scipy.ndimage.gaussian_filter(mood_ary, sigma=(0.6,0.2))
mood_ary[mood_ary<0]=0
mood_ary[mood_ary>255]=255
# make spectral things more visible (after moodbar thing, to not affect color)
mx = numpy.max(bark_ary)
bark_ary /= mx
bark_ary **= 0.35
bark_ary *= mx
bark_ary[bark_ary<0] = 0
bark_ary[bark_ary>255] = 255
return bark_ary.T.astype(numpy.uint8), mood_ary.T.astype(numpy.uint8)
def read_mood(filename):
''' Reads a .mood file into one flat array, which will probably be 3000 items long '''
#stob = os.stat(filename)
f = open(filename,'rb')
data = f.read()
data = array.array('B',data).tolist()
f.close()
return data
def mood_image(filename, height=20, width=None):
''' Given the path to a standard 3-band .mood image, will generate a PIL image for it.
One color per width (rgb mix, stretched up to height)
width by the file size (which in practice should always be 1000 but we're resistant to others)
You can rescale to a new width (interpolated) by supplying one. Default is not to do so.
(TODO: be resistant against ridiculous widths from both file size and from resize request)
'''
from PIL import Image, ImageFilter
data = read_mood(filename)
oo256 = 1./256.
numsamples = int(len(data)/3)
img = Image.new('RGB',(numsamples,1) )
for i in range(0,len(data),3):
r,g,b = data[i:i+3]
r = int( ((r*oo256)**0.57)*255 )
g = int( ((g*oo256)**1.2)*255 )
b = int( ((b*oo256)**1.2)*255 )
img.putpixel( (int(i/3),0), (r,g,b) )
newsize = list(img.size)
if height:
newsize[1]=height
if width:
newsize[0]=width
newsize = tuple(newsize)
if img.size != newsize:
#print newsize
if newsize[0] < 1000: # if scaling down, blur first. CONSIDER: do this in other methods too?
#print "BLUR %.2f"%(1000./newsize[0])
img = img.filter( ImageFilter.BoxBlur( 300./newsize[0] ) )
img = img.resize( newsize )
return img
def mood_image3(filename, height=21, width=None):
''' like mood_image, but shows three bands, with the r, g, b separately. Was partly a debug thing.
'''
from PIL import Image#, ImageFilter
# could play with things like horizonal median?
data = read_mood(filename)
#oo256 = 1./256.
numsamples = len(data)/3
img = Image.new('RGB',(numsamples,3) )
for i in range(0,len(data),3):
r,g,b = data[i:i+3]
img.putpixel( (i/3,0), (r,0,0) )
img.putpixel( (i/3,1), (0,g,0) )
img.putpixel( (i/3,2), (0,0,b) )
newsize = list(img.size)
if height:
newsize[1] = height
if width:
newsize[0] = width
newsize = tuple(newsize)
if img.size != newsize:
img = img.resize( newsize, Image.NEAREST ) # assuming it's just vertical, this may be a little faster- but CONSIDER BILINEAR is more generic?
return img
def fancy_image(barkary, moodary):
''' Own experiment mixing moodbar's colors with the bark spectrogram we made
Takes the output pair from make_mood:
- the 24-band bark version while we still have it,
- the more typical 3-band one we could read later),
(we could work from just barkary, as moodary is just weighed from it),
uses both to make effectively is a mood-colored 1000x24px bark-spectrogram image
'''
from PIL import Image
spe = Image.new('RGB',(1000,24) )
# color it like the standard moodbar, one color for each time interval.
for i in range(0,1000):
lo, mid, hi = moodary[i][0], moodary[i][1], moodary[i][2] # could be weighted from barkary
for bark in range(24):
val = barkary[i][bark]
fval = float(val)/256.
spe.putpixel( (i,23-bark), #23- puts low freqs at bottom, not top
(int(lo*fval),int(hi*fval),int(mid*fval)) )
# note this deviates from moodbar: here blue is mid, green is high
return spe
style_ascii = ' .-~=#'
style_ascii = ' .oO'
style_vertical = u' \u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2589'
style_shades = u' \u2591\u2592\u2593\u2589'
def mood_text(filename, width=78, color=True, truecolor=False, style=style_vertical):
''' RGB-colored, Unicode-block-characters-using line of terminal goodness '''
import helpers_shellcolor as sc # TODO: deal with absence
im = mood_image(filename, height=1, width=width)
def char_for_v255(i, charset=style):
f = float(i)/256.
l = len(charset)
i = f*l
return charset[int(i)]
ret = []
for x in range(im.size[0]):
r,g,b = im.getpixel( (x,0) )
l = 0.3*(r+g+b)
c = char_for_v255(l)
# CONSIDER: consider saturation, not just 'is 1 more than the others'
if truecolor:
ret.append( sc.true_colf(c, r,g,b) )
elif color:
ret.append( sc.closest_from_rgb255(r,g,b, mid=150)(c) )
else:
ret.append( c )
return ''.join(ret)