Difference between revisions of "ISFDB:Data Entropy"

From ISFDB
Jump to navigation Jump to search
(→‎Progress: Added April data)
 
(26 intermediate revisions by 2 users not shown)
Line 9: Line 9:
 
| colspan="3" | Publications w/Pages
 
| colspan="3" | Publications w/Pages
 
| colspan="3" | Publications w/Prices
 
| colspan="3" | Publications w/Prices
 +
| colspan="4" | Verifications
 
|-
 
|-
  
Line 22: Line 23:
 
| colspan="1" | Good
 
| colspan="1" | Good
 
| colspan="1" | % Bad
 
| colspan="1" | % Bad
 +
| colspan="1" | Primary
 +
| colspan="1" | % Primary
 +
| colspan="1" | Any Kind
 +
| colspan="1" | % Any Kind
 
|-
 
|-
  
Line 35: Line 40:
 
| colspan="1" | 74884
 
| colspan="1" | 74884
 
| colspan="1" | 13.5
 
| colspan="1" | 13.5
 +
| colspan="4" |
 
|-
 
|-
  
Line 48: Line 54:
 
| colspan="1" | 75958
 
| colspan="1" | 75958
 
| colspan="1" | 13.8
 
| colspan="1" | 13.8
 +
| colspan="4" |
 
|-
 
|-
  
Line 61: Line 68:
 
| colspan="1" | 79135
 
| colspan="1" | 79135
 
| colspan="1" | 13.6
 
| colspan="1" | 13.6
 +
| colspan="4" |
 
|-
 
|-
  
Line 74: Line 82:
 
| colspan="1" | 82835
 
| colspan="1" | 82835
 
| colspan="1" | 13.2
 
| colspan="1" | 13.2
 +
| colspan="4" |
 
|-
 
|-
  
Line 87: Line 96:
 
| colspan="1" | 85481
 
| colspan="1" | 85481
 
| colspan="1" | 13.0
 
| colspan="1" | 13.0
 +
| colspan="4" |
 
|-
 
|-
  
Line 100: Line 110:
 
| colspan="1" | 90513
 
| colspan="1" | 90513
 
| colspan="1" | 12.6
 
| colspan="1" | 12.6
 +
| colspan="4" |
 
|-
 
|-
  
Line 113: Line 124:
 
| colspan="1" | 96196
 
| colspan="1" | 96196
 
| colspan="1" | 12.0
 
| colspan="1" | 12.0
 +
| colspan="4" |
 
|-
 
|-
  
Line 126: Line 138:
 
| colspan="1" | 97749
 
| colspan="1" | 97749
 
| colspan="1" | 12.1
 
| colspan="1" | 12.1
 +
| colspan="4" |
 
|-
 
|-
  
Line 139: Line 152:
 
| colspan="1" | 99747
 
| colspan="1" | 99747
 
| colspan="1" | 12.0
 
| colspan="1" | 12.0
 +
| colspan="4" |
 
|-
 
|-
  
Line 152: Line 166:
 
| colspan="1" | 100449
 
| colspan="1" | 100449
 
| colspan="1" | 12.1
 
| colspan="1" | 12.1
 +
| colspan="4" |
 
|-
 
|-
  
Line 165: Line 180:
 
| colspan="1" | 100771
 
| colspan="1" | 100771
 
| colspan="1" | 12.1
 
| colspan="1" | 12.1
 +
| colspan="4" |
 
|-
 
|-
  
Line 178: Line 194:
 
| colspan="1" | 101451
 
| colspan="1" | 101451
 
| colspan="1" | 12.1
 
| colspan="1" | 12.1
 +
| colspan="4" |
 
|-
 
|-
  
Line 191: Line 208:
 
| colspan="1" | 101734
 
| colspan="1" | 101734
 
| colspan="1" | 12.1
 
| colspan="1" | 12.1
 +
| colspan="4" |
 
|-
 
|-
  
Line 204: Line 222:
 
| colspan="1" | 101964
 
| colspan="1" | 101964
 
| colspan="1" | 12.1
 
| colspan="1" | 12.1
 +
| colspan="4" |
 
|-
 
|-
  
Line 217: Line 236:
 
| colspan="1" | 103187
 
| colspan="1" | 103187
 
| colspan="1" | 12.0
 
| colspan="1" | 12.0
 +
| colspan="4" |
 
|-
 
|-
  
Line 230: Line 250:
 
| colspan="1" | 104347
 
| colspan="1" | 104347
 
| colspan="1" | 12.0
 
| colspan="1" | 12.0
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2008-04-14
 +
| colspan="1" | 46446
 +
| colspan="1" | 74347
 +
| colspan="1" | 38.5
 +
| colspan="1" | 17236
 +
| colspan="1" | 94745
 +
| colspan="1" | 15.4
 +
| colspan="1" | 14404
 +
| colspan="1" | 105073
 +
| colspan="1" | 12.1
 +
| colspan="4" |
 +
 +
|- align="center"
 +
| 2008-04-28
 +
| colspan="1" | 46321
 +
| colspan="1" | 75369
 +
| colspan="1" | 38.2
 +
| colspan="1" | 17134
 +
| colspan="1" | 96324
 +
| colspan="1" | 15.1
 +
| colspan="1" | 14577
 +
| colspan="1" | 106316
 +
| colspan="1" | 12.1
 +
| colspan="4" |
 +
 +
|- align="center"
 +
| 2008-05-15
 +
| colspan="1" | 45710
 +
| colspan="1" | 77627
 +
| colspan="1" | 37.1
 +
| colspan="1" | 16846
 +
| colspan="1" | 97913
 +
| colspan="1" | 14.7
 +
| colspan="1" | 14736
 +
| colspan="1" | 107396
 +
| colspan="1" | 12.1
 +
| colspan="4" |
 +
 +
|- align="center"
 +
| 2008-05-25
 +
| colspan="1" | 45188
 +
| colspan="1" | 79407
 +
| colspan="1" | 36.3
 +
| colspan="1" | 16752
 +
| colspan="1" | 98929
 +
| colspan="1" | 14.5
 +
| colspan="1" | 14819
 +
| colspan="1" | 108257
 +
| colspan="1" | 12.0
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2008-07-09
 +
| colspan="1" | 43063
 +
| colspan="1" | 86647
 +
| colspan="1" | 33.2
 +
| colspan="1" | 15367
 +
| colspan="1" | 101577
 +
| colspan="1" | 13.1
 +
| colspan="1" | 14861
 +
| colspan="1" | 109132
 +
| colspan="1" | 12.0
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2008-07-26
 +
| colspan="1" | 42058
 +
| colspan="1" | 89751
 +
| colspan="1" | 31.9
 +
| colspan="1" | 15092
 +
| colspan="1" | 102692
 +
| colspan="1" | 12.8
 +
| colspan="1" | 14959
 +
| colspan="1" | 109847
 +
| colspan="1" | 12.0
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2008-08-10
 +
| colspan="1" | 41784
 +
| colspan="1" | 91044
 +
| colspan="1" | 31.5
 +
| colspan="1" | 15032
 +
| colspan="1" | 103948
 +
| colspan="1" | 12.6
 +
| colspan="1" | 15020
 +
| colspan="1" | 110920
 +
| colspan="1" | 11.9
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2008-09-12
 +
| colspan="1" | 41008
 +
| colspan="1" | 94182
 +
| colspan="1" | 30.3
 +
| colspan="1" | 14956
 +
| colspan="1" | 105614
 +
| colspan="1" | 12.4
 +
| colspan="1" | 15331
 +
| colspan="1" | 112352
 +
| colspan="1" | 12.0
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2008-10-01
 +
| colspan="1" | 40448
 +
| colspan="1" | 95801
 +
| colspan="1" | 29.7
 +
| colspan="1" | 14817
 +
| colspan="1" | 106769
 +
| colspan="1" | 12.2
 +
| colspan="1" | 15491
 +
| colspan="1" | 113246
 +
| colspan="1" | 12.0
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2008-11-03
 +
| colspan="1" | 39824
 +
| colspan="1" | 100168
 +
| colspan="1" | 28.4
 +
| colspan="1" | 14663
 +
| colspan="1" | 108616
 +
| colspan="1" | 11.9
 +
| colspan="1" | 15646
 +
| colspan="1" | 114943
 +
| colspan="1" | 12.0
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2008-12-02
 +
| colspan="1" | 39794
 +
| colspan="1" | 103343
 +
| colspan="1" | 27.8
 +
| colspan="1" | 14525
 +
| colspan="1" | 110050
 +
| colspan="1" | 11.7
 +
| colspan="1" | 15763
 +
| colspan="1" | 116088
 +
| colspan="1" | 12.0
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2009-01-05
 +
| colspan="1" | 39623
 +
| colspan="1" | 106075
 +
| colspan="1" | 27.2
 +
| colspan="1" | 14249
 +
| colspan="1" | 112460
 +
| colspan="1" | 11.2
 +
| colspan="1" | 15999
 +
| colspan="1" | 117909
 +
| colspan="1" | 11.9
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2009-02-10
 +
| colspan="1" | 38783
 +
| colspan="1" | 110253
 +
| colspan="1" | 26.0
 +
| colspan="1" | 13878
 +
| colspan="1" | 115513
 +
| colspan="1" | 10.7
 +
| colspan="1" | 16205
 +
| colspan="1" | 120558
 +
| colspan="1" | 11.8
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2009-03-11
 +
| colspan="1" | 38001
 +
| colspan="1" | 113965
 +
| colspan="1" | 25.0
 +
| colspan="1" | 13456
 +
| colspan="1" | 117207
 +
| colspan="1" | 10.3
 +
| colspan="1" | 16209
 +
| colspan="1" | 121836
 +
| colspan="1" | 11.7
 +
| colspan="4" |
 +
|-
 +
 +
|- align="center"
 +
| 2009-04-09
 +
| colspan="1" | 37639
 +
| colspan="1" | 116218
 +
| colspan="1" | 24.5
 +
| colspan="1" | 13124
 +
| colspan="1" | 119319
 +
| colspan="1" | 9.9
 +
| colspan="1" | 16595
 +
| colspan="1" | 123429
 +
| colspan="1" | 11.9
 +
| colspan="1" | 27455
 +
| colspan="1" | 19.6
 +
| colspan="1" | 31673
 +
| colspan="1" | 22.6
 +
|-
 +
 +
|- align="center"
 +
| 2012-06-16
 +
| colspan="1" | 37924
 +
| colspan="1" | 260468
 +
| colspan="1" | 14.6
 +
| colspan="1" | 8352
 +
| colspan="1" | 209504
 +
| colspan="1" | 4.0
 +
| colspan="1" | 33658
 +
| colspan="1" | 202432
 +
| colspan="1" | 16.6
 +
| colspan="1" | 61756
 +
| colspan="1" | 26.2
 +
| colspan="1" | 95200
 +
| colspan="1" | 40.3
 +
|-
 +
 +
|- align="center"
 +
| 2013-03-09
 +
| colspan="1" | 37163
 +
| colspan="1" | 285343
 +
| colspan="1" | 13.0
 +
| colspan="1" | 7916
 +
| colspan="1" | 228079
 +
| colspan="1" | 3.5
 +
| colspan="1" | 36617
 +
| colspan="1" | 220014
 +
| colspan="1" | 16.6
 +
| colspan="1" | 73028
 +
| colspan="1" | 28.5
 +
| colspan="1" | 107353
 +
| colspan="1" | 41.8
 +
|-
 +
|}
 +
 +
== Total verifications over time ==
 +
 +
The initial scripts counted multiple verifications per pub, so couldn't be used to generate percentages.
 +
 +
{| border="1" cellspacing="1" cellpadding="2" bgcolor="#FFFFD0"
 +
 +
|- align="center"
 +
| colspan="1" | Backup Date
 +
| colspan="2" | Verifications
 +
|-
 +
 +
|- align="center"
 +
|
 +
| colspan="1" | Primary
 +
| colspan="1" | Any Kind
 +
|-
 +
|- align="center"
 +
| colspan="1" | 2008-07-09
 +
| colspan="1" | 18609
 +
| colspan="1" | 27420
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2008-07-26
 +
| colspan="1" | 19109
 +
| colspan="1" | 27986
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2008-08-10
 +
| colspan="1" | 19503
 +
| colspan="1" | 28473
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2008-09-12
 +
| colspan="1" | 20517
 +
| colspan="1" | 29698
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2008-10-01
 +
| colspan="1" | 21091
 +
| colspan="1" | 30408
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2008-11-03
 +
| colspan="1" | 22643
 +
| colspan="1" | 32201
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2008-12-02
 +
| colspan="1" | 23806
 +
| colspan="1" | 33930
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2009-01-05
 +
| colspan="1" | 25061
 +
| colspan="1" | 35644
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2009-02-10
 +
| colspan="1" | 27122
 +
| colspan="1" | 39908
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2009-03-11
 +
| colspan="1" | 29350
 +
| colspan="1" | 50177
 +
|-
 +
 +
|- align="center"
 +
| colspan="1" | 2009-04-09
 +
| colspan="1" | 31083
 +
| colspan="1" | 60037
 
|-
 
|-
  
Line 301: Line 649:
 
  from pubs p
 
  from pubs p
 
  group by case ifnull(pub_price, 'L') when 'L' then "Bad" else "Good" end
 
  group by case ifnull(pub_price, 'L') when 'L' then "Bad" else "Good" end
 +
 +
Verifications (corrected):
 +
select 'Primary Verified', count(DISTINCT p.pub_id)
 +
from pubs p, verification v
 +
where v.pub_id = p.pub_id
 +
and (v.reference_id = 1 or v.reference_id = 12)
 +
UNION
 +
select 'Verified', count(DISTINCT p.pub_id)
 +
from pubs p, verification v
 +
where v.pub_id = p.pub_id
 +
UNION
 +
select 'Total Pubs', count(*)
 +
from pubs p

Latest revision as of 07:36, 27 March 2013

Progress

The percentages are for bad/missing data and should ideally continually be decreasing. The total number of magazine pages and publications should generally be increasing as more are published, but may dip if we actively work on duplicate removal for a while. Other clean-up projects may cause blips: e.g. adding publications that appeared only in reviews has led to a slight increase in publications lacking prices as it's difficult to find this data for older publications from online sources alone.

Backup Date Magazine Pages Publications w/Pages Publications w/Prices Verifications
Bad Good % Bad Bad Good % Bad Bad Good % Bad Primary % Primary Any Kind % Any Kind
2006-09-28 68322 2035 97.1 21889 55116 28.4 11730 74884 13.5
2006-12-17 65277 6883 90.5 21662 57122 27.5 12111 75958 13.8
2007-02-18 64693 12240 84.1 21499 60864 26.1 12509 79135 13.6
2007-04-15 64016 15957 80.0 21062 65376 24.4 12611 82835 13.2
2007-06-12 62676 22479 73.6 20738 68877 23.1 12828 85481 13.0
2007-08-11 59852 33603 64.0 20409 74829 21.4 13023 90513 12.6
2007-11-08 53750 48353 52.6 19535 82131 19.2 13118 96196 12.0
2007-12-09 52599 52163 50.0 19502 84153 18.8 13468 97749 12.1
2007-12-29 50814 58049 46.6 19189 86626 18.1 13659 99747 12.0
2008-01-07 50108 60482 45.3 19140 87695 17.9 13852 100449 12.1
2008-01-14 49913 61442 44.8 19130 88133 17.8 13921 100771 12.1
2008-01-28 49481 62651 44.1 18845 89176 17.4 13972 101451 12.1
2008-02-04 49013 63802 43.5 18689 89658 17.0 13977 101734 12.1
2008-02-11 48548 65301 42.6 18520 90118 17.0 14020 101964 12.1
2008-03-01 47531 68888 40.8 18187 91785 16.5 14087 103187 12.0
2008-04-06 46449 73894 38.6 17337 93842 15.6 14270 104347 12.0
2008-04-14 46446 74347 38.5 17236 94745 15.4 14404 105073 12.1
2008-04-28 46321 75369 38.2 17134 96324 15.1 14577 106316 12.1
2008-05-15 45710 77627 37.1 16846 97913 14.7 14736 107396 12.1
2008-05-25 45188 79407 36.3 16752 98929 14.5 14819 108257 12.0
2008-07-09 43063 86647 33.2 15367 101577 13.1 14861 109132 12.0
2008-07-26 42058 89751 31.9 15092 102692 12.8 14959 109847 12.0
2008-08-10 41784 91044 31.5 15032 103948 12.6 15020 110920 11.9
2008-09-12 41008 94182 30.3 14956 105614 12.4 15331 112352 12.0
2008-10-01 40448 95801 29.7 14817 106769 12.2 15491 113246 12.0
2008-11-03 39824 100168 28.4 14663 108616 11.9 15646 114943 12.0
2008-12-02 39794 103343 27.8 14525 110050 11.7 15763 116088 12.0
2009-01-05 39623 106075 27.2 14249 112460 11.2 15999 117909 11.9
2009-02-10 38783 110253 26.0 13878 115513 10.7 16205 120558 11.8
2009-03-11 38001 113965 25.0 13456 117207 10.3 16209 121836 11.7
2009-04-09 37639 116218 24.5 13124 119319 9.9 16595 123429 11.9 27455 19.6 31673 22.6
2012-06-16 37924 260468 14.6 8352 209504 4.0 33658 202432 16.6 61756 26.2 95200 40.3
2013-03-09 37163 285343 13.0 7916 228079 3.5 36617 220014 16.6 73028 28.5 107353 41.8

Total verifications over time

The initial scripts counted multiple verifications per pub, so couldn't be used to generate percentages.

Backup Date Verifications
Primary Any Kind
2008-07-09 18609 27420
2008-07-26 19109 27986
2008-08-10 19503 28473
2008-09-12 20517 29698
2008-10-01 21091 30408
2008-11-03 22643 32201
2008-12-02 23806 33930
2009-01-05 25061 35644
2009-02-10 27122 39908
2009-03-11 29350 50177
2009-04-09 31083 60037

Prices

Backup Date Total pubs Pubs without a price % of pubs without a price Bad price % of pubs with a bad price US price New UK price Old UK price Canadian price Australian price
2008-01-20 115043 14166 12.3% 1027 0.89% 80540 17356 1583 145 214

SQL scripts Used

These are rough scripts and could generally all be improved: however, the results above were generated using these so any changes/improvements invalidate (even if only partially) the figures gathered so far.

Magazine Pages with or without page numbers:

select case ifnull(pubc_page, -999) when -999 then "Bad" else "Good" end, count(*)
from pub_content pc, pubs p, titles t
where p.pub_id = pc.pub_id
and p.pub_ctype = 'MAGAZINE'
and t.title_id = pc.title_id
and t.title_ttype NOT IN ('EDITOR','COVERART')
group by case ifnull(pubc_page, -999) when -999 then "Bad" else "Good" end

Publications with or without total page count:

select case ifnull(pub_pages, -999) when -999 then "Bad" else "Good" end, count(*)
from pubs p
WHERE p.pub_ptype NOT LIKE '%audio%'  And p.pub_ptype NOT LIKE '%cassette%'
And p.pub_ptype NOT LIKE 'CD%'        And p.pub_ptype NOT LIKE 'compact disc%'
And p.pub_ptype NOT LIKE 'e%book%'    And p.pub_ptype NOT LIKE 'electron%'
And p.pub_ptype NOT LIKE '%web%'      And p.pub_ptype NOT LIKE '%ezine%'
And p.pub_ptype NOT LIKE '%internet%' And p.pub_ptype NOT LIKE '%mp3%'
And p.pub_ptype NOT LIKE '%Adobe%'    And p.pub_ptype NOT LIKE '%Mobipocket%'
And p.pub_ptype NOT LIKE '%PDF%'      And p.pub_ptype NOT LIKE '%tape%'
And p.pub_ptype NOT LIKE '%www%'      And p.pub_ptype NOT LIKE '%digit%'
And p.pub_ptype NOT LIKE '%online%'
group by case ifnull(pub_pages, -999) when -999 then "Bad" else "Good" end

Publications with or without prices:

select case ifnull(pub_price, 'L') when 'L' then "Bad" else "Good" end, count(*)
from pubs p
group by case ifnull(pub_price, 'L') when 'L' then "Bad" else "Good" end

Verifications (corrected):

select 'Primary Verified', count(DISTINCT p.pub_id)
from pubs p, verification v
where v.pub_id = p.pub_id
and (v.reference_id = 1 or v.reference_id = 12)
UNION
select 'Verified', count(DISTINCT p.pub_id)
from pubs p, verification v
where v.pub_id = p.pub_id
UNION
select 'Total Pubs', count(*)
from pubs p