| Author | Topics » Book an abo for this thread |  |
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 09.05.2007 - 16:25 |  |
Damn, I knew it was too good to be true.. looks like I messed something up again, because some pixels have the wrong colour. Fixing this will definitely hurt performance..
|
|
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 09.05.2007 - 16:38 |  |
One optimisation opens up another...
Generally speaking, the port is now twice as fast as it was initially (i.e. from 35 to 70 avg. FPS on an x1650pro at 1280x1024 - for sprite games like Lunar).
Ver: i figured out some colors can be ignored (D3, D6) in my later versions. Sure u can do the same.
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 09.05.2007 - 16:56 |  |
Yep, I noticed ^_^
Might I suggest this code for assigning the values? It's somewhat simpler and a bit faster on both my cards: | Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | 13: | | 14: | | 15: | | 16: | | 17: | | 18: | | 19: | | 20: | | 21: | | 22: | | 23: | | 24: | | | | vec2 OGL2Pos = gl_TexCoord[0].xy*OGL2Size.xy,
| fp = fract(OGL2Pos),
| dx = vec2( OGL2InvSize.x, 0.0),
| dy = vec2( 0.0, OGL2InvSize.y),
| g1 = vec2( OGL2InvSize.x,OGL2InvSize.y),
| g2 = vec2(-OGL2InvSize.x,OGL2InvSize.y);
| if(fp.x >= .5 && fp.y < .5) g2*=-1.0;
| vec2 g3 = .5*(g1+g2), g4 = .5*(g1-g2),
| pC4 = floor(OGL2Pos)/OGL2Size.xy;
| vec3 C0 = texture2D(OGL2Texture,pC4-g1 ).xyz,
| C1 = texture2D(OGL2Texture,pC4-g3 ).xyz,
| C2 = texture2D(OGL2Texture,pC4-g2 ).xyz,
| C3 = texture2D(OGL2Texture,pC4-g4 ).xyz,
| C4 = texture2D(OGL2Texture,pC4 ).xyz,
| C5 = texture2D(OGL2Texture,pC4+g4 ).xyz,
| C6 = texture2D(OGL2Texture,pC4+g2 ).xyz,
| C7 = texture2D(OGL2Texture,pC4+g3 ).xyz,
| C8 = texture2D(OGL2Texture,pC4+g1 ).xyz,
| D0 = texture2D(OGL2Texture,pC4+g2+g3).xyz,
| D1 = texture2D(OGL2Texture,pC4+g1+g2).xyz,
| D2 = texture2D(OGL2Texture,pC4+g1+g3).xyz,
| D4 = texture2D(OGL2Texture,pC4+g1-g2).xyz,
| D5 = texture2D(OGL2Texture,pC4+g1+g4).xyz,
| | p10,p11; | |
[Dieser Beitrag wurde am 09.05.2007 - 17:08 von VerGreeneyes aktualisiert]
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 09.05.2007 - 18:23 |  |
OK., i figured how it works faster too... 
| Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | 13: | | 14: | | 15: | | 16: | | 17: | | 18: | | 19: | | 20: | | 21: | | 22: | | 23: | | 24: | | 25: | | 26: | | 27: | | 28: | | 29: | | 30: | | 31: | | 32: | | 33: | | 34: | | 35: | | 36: | | 37: | | 38: | | 39: | | 40: | | 41: | | 42: | | 43: | | 44: | | 45: | | 46: | | 47: | | 48: | | 49: | | 50: | | 51: | | 52: | | 53: | | 54: | | 55: | | 56: | | 57: | | 58: | | 59: | | 60: | | 61: | | 62: | | 63: | | 64: | | 65: | | 66: | | 67: | | 68: | | 69: | | 70: | | 71: | | 72: | | 73: | | 74: | | 75: | | 76: | | 77: | | 78: | | 79: | | 80: | | 81: | | 82: | | 83: | | 84: | | 85: | | 86: | | 87: | | 88: | | 89: | | 90: | | 91: | | 92: | | 93: | | 94: | | 95: | | 96: | | 97: | | 98: | | 99: | | 100: | | 101: | | 102: | | 103: | | 104: | | 105: | | 106: | | 107: | | 108: | | 109: | | 110: | | 111: | | 112: | | 113: | | 114: | | 115: | | 116: | | 117: | | 118: | | 119: | | 120: | | 121: | | 122: | | 123: | | 124: | | 125: | | 126: | | 127: | | 128: | | 129: | | 130: | | 131: | | 132: | | 133: | | 134: | | 135: | | 136: | | 137: | | 138: | | 139: | | 140: | | 141: | | 142: | | 143: | | | |
| /*
| 2xSaI GLSL shader
|
| - Copyright (C) 2007 guest(r) - guest.r@gmail.com
|
| - License: GNU-GPL
|
|
| The 2xSaI algorithm
|
| - Copyright (c) 1999-2001 by Derek Liauw Kie Fa.
|
| */
|
|
| const vec3 dt = vec3(65536.0,256.0,1.0);
|
|
|
| // Compares how A and B relate to the pair (C, D).
| // Each term is 0.0 when its colour equals both C and D, and 1.0 otherwise;
| // the result is the A term minus the B term, i.e. -1.0, 0.0 or +1.0.
| float GET_RESULT(float A, float B, float C, float D)
| {
|     float aDiffers = sign(abs(A-C)+abs(A-D));
|     float bDiffers = sign(abs(B-C)+abs(B-D));
|     return (aDiffers - bDiffers);
| }
|
|
| // Collapses an RGB colour into a single float key by weighting the
| // channels with the file-level constant dt, so that colours can be
| // compared with one scalar comparison.
| float reduce(vec3 color)
| {
|     float key = dot(color,dt);
|     return key;
| }
|
|
| uniform vec4 OGL2Size;
| uniform vec4 OGL2InvSize;
| uniform sampler2D OGL2Texture;
|
| // Fragment entry point. Reads a 14-texel neighbourhood around the texel
| // under the current fragment, computes the two 2xSaI products p10 and p11,
| // and writes C4, p10 or p11 to gl_FragColor.xyz depending on which quarter
| // of the source texel the fragment falls in.
| // NOTE(review): only gl_FragColor.xyz is assigned; .w is never written here.
| void main()
| {
|
| // Calculating texel coordinates
| // OGL2Pos is the texture coordinate scaled by OGL2Size and fp its fractional
| // part (assumes OGL2Size holds the texture dimensions, so fp is the position
| // inside one texel - TODO confirm against the host program).
| // g1 / g2 are the diagonal and anti-diagonal offsets built from OGL2InvSize
| // (presumably one texel in each direction - TODO confirm).
|
| vec2 OGL2Pos = gl_TexCoord[0].xy*OGL2Size.xy;
| vec2 fp = fract(OGL2Pos);
| vec2 g1 = vec2( OGL2InvSize.x,OGL2InvSize.y);
| vec2 g2 = vec2(-OGL2InvSize.x,OGL2InvSize.y);
|
| // Negating g2 swaps the x and y roles of every offset derived from it
| // below, i.e. the neighbourhood is mirrored across the g1 diagonal. That
| // lets the single p10 computation serve both "mixed" quarters.
| if (fp.x >= 0.50 && fp.y < 0.50) g2*=-1.0;
|
| // pC4 addresses the current texel; pC8 / pC0 are its neighbours along g1.
| vec2 pC4 = floor(OGL2Pos)/OGL2Size.xy;
| vec2 pC8 = pC4 + g1;
| vec2 pC0 = pC4 - g1;
|
| // Half-sums of g1 and g2 give the axis-aligned steps:
| // with g2 unflipped, pC3 = pC4-(x,0), pC1 = pC4-(0,y),
| // pC5 = pC4+(x,0), pC7 = pC4+(0,y).
| vec2 p04 = pC4 - 0.5*g1;
| vec2 pC3 = p04 + 0.5*g2;
| vec2 pC1 = pC3 - g2;
| vec2 pC5 = pC1 + g1;
| vec2 pC7 = pC3 + g1;
|
|
| // Reading the texels
| // C0..C8 form the 3x3 block centred on C4; D4/D5 and D0/D1/D2 are fetched
| // two steps out from C4 (derived from the offsets above).
|
| vec3 C0 = texture2D(OGL2Texture,pC0 ).xyz;
| vec3 C1 = texture2D(OGL2Texture,pC1 ).xyz;
| vec3 C2 = texture2D(OGL2Texture,pC4-g2).xyz;
| vec3 C3 = texture2D(OGL2Texture,pC3 ).xyz;
| vec3 C4 = texture2D(OGL2Texture,pC4 ).xyz;
| vec3 C5 = texture2D(OGL2Texture,pC5 ).xyz;
| vec3 D4 = texture2D(OGL2Texture,pC8-g2).xyz;
| vec3 C6 = texture2D(OGL2Texture,pC4+g2).xyz;
| vec3 C7 = texture2D(OGL2Texture,pC7 ).xyz;
| vec3 C8 = texture2D(OGL2Texture,pC8 ).xyz;
| vec3 D5 = texture2D(OGL2Texture,pC5+g1).xyz;
| vec3 D0 = texture2D(OGL2Texture,pC7+g2).xyz;
| vec3 D1 = texture2D(OGL2Texture,pC8+g2).xyz;
| vec3 D2 = texture2D(OGL2Texture,pC7+g1).xyz;
| vec3 p10,p11;
|
| // Scalar keys for the colours; all comparisons below use exact float
| // equality on these keys.
| float c0 = reduce(C0);float c1 = reduce(C1);
| float c2 = reduce(C2);float c3 = reduce(C3);
| float c4 = reduce(C4);float c5 = reduce(C5);
| float c6 = reduce(C6);float c7 = reduce(C7);
| float c8 = reduce(C8);float d0 = reduce(D0);
| float d1 = reduce(D1);float d2 = reduce(D2);
| float d4 = reduce(D4);float d5 = reduce(D5);
|
|
| /* SaI code */
| /* Copied from the Dosbox source code */
| /* Copyright (C) 2002-2007 The DOSBox Team */
| /* License: GNU-GPL */
| /* Adapted by guest(r) on 20.4 and 9.5. 2007 */
|
| // Case 1: the C4-C8 diagonal matches.
| if (c4 == c8) {
| if (c5 != c7) {
| // Only the C4-C8 diagonal matches: p11 takes C4, p10 takes C4 when one of
| // the two edge patterns holds, otherwise the C4/C7 average.
| if (((c4 == c3)&&(c7 == d2))||((c4 == c5)&&(c4 == c6)&&(c3 != c7)&&(c7 == d0))) {
| p10 = C4;
| } else {
| p10 = 0.5*(C4+C7);
| }
| p11 = C4;
| } else {
| // Both diagonals match.
| if (c4 == c5) {
| // All four texels of the 2x2 block are equal.
| p10 = C4;
| p11 = C4;
| } else {
| // Two different diagonals: vote with four GET_RESULT terms over the
| // surrounding texels; a tie falls back to the 4-texel average.
| float r;
| r = GET_RESULT(c4,c5,c3,c1);
| r -= GET_RESULT(c5,c4,d4,c2);
| r -= GET_RESULT(c5,c4,c6,d1);
| r += GET_RESULT(c4,c5,d5,d2);
| if (r > 0.0) p11 = C4;
| else if (r < 0.0) p11 = C5;
| else p11 = 0.25*(C4+C5+C7+C8);
| p10 = 0.5*(C4+C7);
| }
| }
| } else
| // Case 2: only the C5-C7 diagonal matches.
| if (c5 == c7) {
| if (((c7 == c6)&&(c4 == c2))||((c7 == c3)&&(c7 == c8)&&(c4 != c6)&&(c4 == c0))) {
| p10 = C7;
| } else {
| p10 = 0.5*(C4+C7);
| }
| p11 = C5;
| } else {
| // Case 3: neither diagonal matches: p11 is the 4-texel average.
| p11 = 0.25*(C4+C5+C7+C8);
|
| if ((c4 == c5)&&(c4 == c6)&&(c3 != c7)&&(c7 == d0)) {
| p10 = C4;
| } else if ((c7 == c3)&&(c7 == c8)&&(c4 != c6)&&(c4 == c0)) {
| p10 = C7;
| } else {
| p10 = 0.5*(C4+C7);
| }
| }
|
| // Distributing the final products
| // Both fractions >= 0.5 -> p11, both < 0.5 -> the untouched texel C4,
| // the two remaining quarters -> p10.
|
| if (fp.x >= 0.5 && fp.y >= 0.5) gl_FragColor.xyz = p11; else
| if (fp.x < 0.5 && fp.y < 0.5) gl_FragColor.xyz = C4; else
| gl_FragColor.xyz = p10;
| }
| | |
Vertex file stays the same...
[Dieser Beitrag wurde am 10.05.2007 - 13:11 von guest aktualisiert]
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 09.05.2007 - 21:57 |  |
Wow, that was some optimisation, I get a constant 60fps now! *goes to see what you changed*
Edit: well it took me some time to figure it out with the somewhat messed up spacing, but I see what you did now; nice work!
[Dieser Beitrag wurde am 09.05.2007 - 22:20 von VerGreeneyes aktualisiert]
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 10.05.2007 - 13:16 |  |
It looks like the SW 2xSaI code has still some speedup potential (SuperEagle is optimised already).
I'll upload the shader again since the branching levels were really too well disguised.
I almost forgot...
When i first used the code to calculate p01 as the "major candidate", some colors looked different.
Dunno if it's an older, newer bug, scaler feature...
When i turned things around (and used the p10 code) i didn't notice any differences between full and adapted algorithms.
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 10.05.2007 - 14:23 |  |
Well, barring further optimisations, here's my final code (vertex file unchanged):
| Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | 13: | | 14: | | 15: | | 16: | | 17: | | 18: | | 19: | | 20: | | 21: | | 22: | | 23: | | 24: | | 25: | | 26: | | 27: | | 28: | | 29: | | 30: | | 31: | | 32: | | 33: | | 34: | | 35: | | 36: | | 37: | | 38: | | 39: | | 40: | | 41: | | 42: | | 43: | | 44: | | 45: | | 46: | | 47: | | 48: | | 49: | | 50: | | 51: | | 52: | | 53: | | 54: | | 55: | | 56: | | 57: | | 58: | | 59: | | 60: | | 61: | | 62: | | 63: | | 64: | | 65: | | 66: | | 67: | | 68: | | 69: | | 70: | | 71: | | 72: | | 73: | | 74: | | 75: | | 76: | | 77: | | 78: | | 79: | | 80: | | 81: | | 82: | | 83: | | 84: | | 85: | | 86: | | 87: | | 88: | | 89: | | 90: | | 91: | | 92: | | 93: | | 94: | | 95: | | 96: | | 97: | | 98: | | 99: | | 100: | | 101: | | 102: | | 103: | | 104: | | 105: | | 106: | | 107: | | | | /*
| 2xSaI GLSL shader
|
| - Copyright (C) 2007 guest(r) - guest.r@gmail.com
|
| - License: GNU-GPL
|
| The 2xSaI algorithm
|
| - Copyright (c) 1999-2001 by Derek Liauw Kie Fa.
| */
|
| #define reduce(color)(dot(color,dt))
|
| const vec3 dt = vec3(65536.,256.,1.);
| uniform vec4 OGL2Size, OGL2InvSize;
| uniform sampler2D OGL2Texture;
|
| // Fragment entry point. Reads a 14-texel neighbourhood around the texel
| // under the current fragment, computes the two 2xSaI products p10 and p11,
| // and writes C4, p10 or p11 to gl_FragColor.xyz depending on which quarter
| // of the source texel the fragment falls in.
| // NOTE(review): only gl_FragColor.xyz is assigned; .w is never written here.
| void main()
| {
|
| // Calculating texel coordinates
| // fp is the fractional part of the texture coordinate scaled by OGL2Size
| // (assumes OGL2Size holds the texture dimensions - TODO confirm).
| // g1 / g2 are the diagonal and anti-diagonal offsets built from OGL2InvSize.
|
| vec2 OGL2Pos = gl_TexCoord[0].xy*OGL2Size.xy,
| fp = fract(OGL2Pos),
| g1 = vec2( OGL2InvSize.x,OGL2InvSize.y),
| g2 = vec2(-OGL2InvSize.x,OGL2InvSize.y);
|
| // Negating g2 swaps the x and y roles of every offset derived from it,
| // so the single p10 computation serves both "mixed" quarters.
| if(fp.x >= .5 && fp.y < .5) g2=-g2;
|
| // pC4 addresses the current texel, pC8 its neighbour along g1.
| // g3 = half of (g1-g2): an x-only step normally, a y-only step when g2
| // has been negated.
| vec2 pC4 = floor(OGL2Pos)/OGL2Size.xy,
| g3 = .5*(g1-g2), pC8 = pC4+g1;
|
|
| // Reading the texels
| // C0..C8 form the 3x3 block centred on C4; D0/D1/D2 and D4/D5 are fetched
| // two steps out from C4 (derived from the offsets above).
|
| vec3 C0 = texture2D(OGL2Texture,pC4-g1 ).xyz,
| C1 = texture2D(OGL2Texture,pC4-g2-g3 ).xyz,
| C2 = texture2D(OGL2Texture,pC4-g2 ).xyz,
| C3 = texture2D(OGL2Texture,pC4-g3 ).xyz,
| C4 = texture2D(OGL2Texture,pC4 ).xyz,
| C5 = texture2D(OGL2Texture,pC4+g3 ).xyz,
| C6 = texture2D(OGL2Texture,pC4+g2 ).xyz,
| C7 = texture2D(OGL2Texture,pC8-g3 ).xyz,
| C8 = texture2D(OGL2Texture,pC8 ).xyz,
| D0 = texture2D(OGL2Texture,pC4+2.*g2+g3).xyz,
| D1 = texture2D(OGL2Texture,pC8+g2 ).xyz,
| D2 = texture2D(OGL2Texture,pC8+g1-g3 ).xyz,
| D4 = texture2D(OGL2Texture,pC8-g2 ).xyz,
| D5 = texture2D(OGL2Texture,pC8+g3 ).xyz,
| p10,p11;
|
| // Scalar keys for the colours (reduce is the dot-with-dt macro defined at
| // file level); all comparisons below use exact float equality on them.
| float c0 = reduce(C0),c1 = reduce(C1),c2 = reduce(C2),c3 = reduce(C3),
| c4 = reduce(C4),c5 = reduce(C5),c6 = reduce(C6),c7 = reduce(C7),
| c8 = reduce(C8),d0 = reduce(D0),d1 = reduce(D1),d2 = reduce(D2),
| d4 = reduce(D4),d5 = reduce(D5);
|
|
| /* SaI code */
| /* Copied from the Dosbox source code */
| /* Copyright (C) 2002-2007 The DOSBox Team */
| /* License: GNU-GPL */
| /* Adapted by guest(r) on 20.4 and 9.5. 2007 */
|
| // Case 1: the C4-C8 diagonal matches.
| if(c4 == c8)
| {
| if(c5 != c7)
| {
| // Only C4-C8 matches: p10 is C4 when one of the two edge patterns holds,
| // otherwise the C4/C7 average.
| p10 = (c4 == c3 && c7 == d2 || c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) ? C4 : .5*(C4+C7);
| p11 = C4;
| }
| else
| {
| // Both diagonals match; if they also match each other the 2x2 block is flat.
| if(c4 == c5) p11 = (p10 = C4);
| else
| {
| // Vote between C4 and C5: each sign() term is 0 when the colour equals
| // both texels of a neighbouring pair and 1 otherwise. The C4 terms are
| // added and the C5 terms subtracted; r == 0 falls back to the average.
| float r = sign(abs(c4-c3)+abs(c4-c1))+sign(abs(c4-d4)+abs(c4-c2))
| +sign(abs(c4-c6)+abs(c4-d1))+sign(abs(c4-d5)+abs(c4-d2))
| -sign(abs(c5-c3)+abs(c5-c1))-sign(abs(c5-d4)+abs(c5-c2))
| -sign(abs(c5-c6)+abs(c5-d1))-sign(abs(c5-d5)+abs(c5-d2));
| p10 = .5*(C4+C7);
| if(r > 0.) p11 = C4;
| else if(r < 0.) p11 = C5;
| else p11 = .25*(C4+C5+C7+C8);
| }
| }
| }
| // Case 2: only the C5-C7 diagonal matches.
| else if(c5 == c7)
| {
| p10 = (c7 == c6 && c4 == c2 || c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) ? C7 : 0.5*(C4+C7);
| p11 = C5;
| }
| // Case 3: neither diagonal matches: p11 is the 4-texel average.
| else
| {
| p11 = 0.25*(C4+C5+C7+C8);
| if(c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) p10 = C4;
| else if(c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) p10 = C7;
| else p10 = 0.5*(C4+C7);
| }
|
| // Distributing the final products
| // Both fractions >= .5 -> p11, exactly one >= .5 -> p10,
| // neither -> the untouched texel C4.
|
| if(fp.x >= .5 && fp.y >= .5) gl_FragColor.xyz = p11;
| else if(fp.x >= .5 || fp.y >= .5) gl_FragColor.xyz = p10;
| else gl_FragColor.xyz = C4;
| }
| | |
Hope you agree with the coding conventions I used.. this should be 'by the book'. This one gives me a constant ~61fps on my laptop, with vsync + triple buffering disabled (which makes me lose about 5fps). Tested in Chrono Trigger.
Edit: actually, I take that back.. this is all so confusing. Now I'm getting more fps without said combination... Oh well, it's at 60 at least.
[Dieser Beitrag wurde am 10.05.2007 - 14:27 von VerGreeneyes aktualisiert]
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 10.05.2007 - 20:03 |  |
A version based on modified diag. color "equalities".
Branching through larger code segments more often results in a moderate speed reduction.
| Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | 13: | | 14: | | 15: | | 16: | | 17: | | 18: | | 19: | | 20: | | 21: | | 22: | | 23: | | 24: | | 25: | | 26: | | 27: | | 28: | | 29: | | 30: | | 31: | | 32: | | 33: | | 34: | | 35: | | 36: | | 37: | | 38: | | 39: | | 40: | | 41: | | 42: | | 43: | | 44: | | 45: | | 46: | | 47: | | 48: | | 49: | | 50: | | 51: | | 52: | | 53: | | 54: | | 55: | | 56: | | 57: | | 58: | | 59: | | 60: | | 61: | | 62: | | 63: | | 64: | | 65: | | 66: | | 67: | | 68: | | 69: | | 70: | | 71: | | 72: | | 73: | | 74: | | 75: | | 76: | | 77: | | 78: | | 79: | | 80: | | 81: | | 82: | | 83: | | 84: | | 85: | | 86: | | 87: | | 88: | | 89: | | 90: | | 91: | | 92: | | 93: | | 94: | | 95: | | 96: | | 97: | | 98: | | 99: | | 100: | | 101: | | 102: | | 103: | | 104: | | 105: | | 106: | | 107: | | 108: | | 109: | | 110: | | 111: | | 112: | | 113: | | 114: | | 115: | | 116: | | 117: | | 118: | | 119: | | 120: | | 121: | | 122: | | | |
| /*
| 2xSaI GLSL shader
|
| - Copyright (C) 2007 guest(r) - guest.r@gmail.com
|
| - License: GNU-GPL
|
| - Enhanced by VerGreeneyes (10.5.2007)
|
| - (Experimental version - modified color "equality")
|
|
|
| The 2xSaI algorithm
|
| - Copyright (c) 1999-2001 by Derek Liauw Kie Fa.
| */
|
| #define reduce(color)(dot(color,dt))
|
| const float th = 0.064; // 2 * 1/2^5 tolerance (2 singletons in RGB555)
|
| const vec3 dt = vec3(65536.,256.,1.);
| const vec3 dp = vec3(1.0);
| uniform vec4 OGL2Size, OGL2InvSize;
| uniform sampler2D OGL2Texture;
|
| // Fragment entry point of the experimental variant. Same structure as the
| // plain 2xSaI shader, but the two diagonal tests (C4 vs C8, C5 vs C7) use a
| // summed absolute RGB difference compared against the file-level threshold
| // th instead of exact equality. All other comparisons remain exact.
| // NOTE(review): only gl_FragColor.xyz is assigned; .w is never written here.
| void main()
| {
|
| // Calculating texel coordinates
| // fp is the fractional part of the texture coordinate scaled by OGL2Size
| // (assumes OGL2Size holds the texture dimensions - TODO confirm).
| // g1 / g2 are the diagonal and anti-diagonal offsets built from OGL2InvSize.
|
| vec2 OGL2Pos = gl_TexCoord[0].xy*OGL2Size.xy,
| fp = fract(OGL2Pos),
| g1 = vec2( OGL2InvSize.x,OGL2InvSize.y),
| g2 = vec2(-OGL2InvSize.x,OGL2InvSize.y);
|
| // Negating g2 swaps the x and y roles of every offset derived from it,
| // so the single p10 computation serves both "mixed" quarters.
| if(fp.x >= .5 && fp.y < .5) g2=-g2;
|
| // pC4 addresses the current texel, pC8 its neighbour along g1.
| // g3 = half of (g1-g2): an x-only step normally, a y-only step when g2
| // has been negated.
| vec2 pC4 = floor(OGL2Pos)/OGL2Size.xy,
| g3 = .5*(g1-g2), pC8 = pC4+g1;
|
|
| // Reading the texels
| // C0..C8 form the 3x3 block centred on C4; D0/D1/D2 and D4/D5 are fetched
| // two steps out from C4 (derived from the offsets above).
|
| vec3 C0 = texture2D(OGL2Texture,pC4-g1 ).xyz,
| C1 = texture2D(OGL2Texture,pC4-g2-g3 ).xyz,
| C2 = texture2D(OGL2Texture,pC4-g2 ).xyz,
| C3 = texture2D(OGL2Texture,pC4-g3 ).xyz,
| C4 = texture2D(OGL2Texture,pC4 ).xyz,
| C5 = texture2D(OGL2Texture,pC4+g3 ).xyz,
| C6 = texture2D(OGL2Texture,pC4+g2 ).xyz,
| C7 = texture2D(OGL2Texture,pC8-g3 ).xyz,
| C8 = texture2D(OGL2Texture,pC8 ).xyz,
| D0 = texture2D(OGL2Texture,pC4+2.*g2+g3).xyz,
| D1 = texture2D(OGL2Texture,pC8+g2 ).xyz,
| D2 = texture2D(OGL2Texture,pC8+g1-g3 ).xyz,
| D4 = texture2D(OGL2Texture,pC8-g2 ).xyz,
| D5 = texture2D(OGL2Texture,pC8+g3 ).xyz,
| p10,p11;
|
| // Scalar keys for the colours, used by the exact-equality tests below.
| float c0 = reduce(C0),c1 = reduce(C1),c2 = reduce(C2),c3 = reduce(C3),
| c4 = reduce(C4),c5 = reduce(C5),c6 = reduce(C6),c7 = reduce(C7),
| c8 = reduce(C8),d0 = reduce(D0),d1 = reduce(D1),d2 = reduce(D2),
| d4 = reduce(D4),d5 = reduce(D5);
|
|
| /* SaI code */
| /* Copied from the Dosbox source code */
| /* Copyright (C) 2002-2007 The DOSBox Team */
| /* License: GNU-GPL */
| /* Adapted by guest(r) on 20.4 and 9.5. 2007 */
| /* and VerGreeneyes (10.5.2007) */
|
|
| // Summed per-channel absolute differences along the two diagonals
| // (dp is the all-ones weight vector defined at file level).
| float dif1 = dot(abs(C4-C8),dp);
| float dif2 = dot(abs(C5-C7),dp);
|
| // Case 1: C4 and C8 are within tolerance.
| if(dif1 <= th)
| {
| if(dif2 > th)
| {
| // Only the C4-C8 diagonal is similar.
| p10 = (c4 == c3 && c7 == d2 || c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) ? C4 : .5*(C4+C7);
| p11 = C4;
| }
| else
| {
| // Both diagonals are similar.
| if(c4 == c5) p11 = (p10 = C4);
| else
| {
| // Vote between C4 and C5: each sign() term is 0 when the colour equals
| // both texels of a neighbouring pair and 1 otherwise. The C4 terms are
| // added and the C5 terms subtracted; r == 0 falls back to the average.
| float r = sign(abs(c4-c3)+abs(c4-c1))+sign(abs(c4-d4)+abs(c4-c2))
| +sign(abs(c4-c6)+abs(c4-d1))+sign(abs(c4-d5)+abs(c4-d2))
| -sign(abs(c5-c3)+abs(c5-c1))-sign(abs(c5-d4)+abs(c5-c2))
| -sign(abs(c5-c6)+abs(c5-d1))-sign(abs(c5-d5)+abs(c5-d2));
| p10 = .5*(C4+C7);
| if(r > 0.) p11 = C4;
| else if(r < 0.) p11 = C5;
| else p11 = .25*(C4+C5+C7+C8);
| }
| }
| }
| // Case 2: only C5 and C7 are within tolerance.
| else if(dif2 <= th)
| {
| p10 = (c7 == c6 && c4 == c2 || c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) ? C7 : 0.5*(C4+C7);
| p11 = C5;
| }
| // Case 3: neither diagonal is similar: p11 is the 4-texel average.
| else
| {
| p11 = 0.25*(C4+C5+C7+C8);
| if(c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) p10 = C4;
| else if(c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) p10 = C7;
| else p10 = 0.5*(C4+C7);
| }
|
| // Distributing the final products
| // Both fractions >= .5 -> p11, exactly one >= .5 -> p10,
| // neither -> the untouched texel C4.
|
| if(fp.x >= .5 && fp.y >= .5) gl_FragColor.xyz = p11;
| else if(fp.x >= .5 || fp.y >= .5) gl_FragColor.xyz = p10;
| else gl_FragColor.xyz = C4;
| }
| | |
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 10.05.2007 - 20:35 |  |
I can't see any difference in Chrono Trigger - perhaps I should test a different game. Here's a version I was working on: I included your experimental changes along with some of my own. The GET_RESULT function is back, because I did some testing and I'm sure the previous version didn't behave the same way as the original (although I didn't see any difference, to be fair..); this slows the filter down a bit, but I also made it use vec4 instead of vec3, which speeds it up quite a bit. Found that out on accident. In theory the alpha channel is getting 2xSaId now, so you might want to check a few games to see if everything still looks right. (I don't include it in the calculations though)
Fragment file: | Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | 13: | | 14: | | 15: | | 16: | | 17: | | 18: | | 19: | | 20: | | 21: | | 22: | | 23: | | 24: | | 25: | | 26: | | 27: | | 28: | | 29: | | 30: | | 31: | | 32: | | 33: | | 34: | | 35: | | 36: | | 37: | | 38: | | 39: | | 40: | | 41: | | 42: | | 43: | | 44: | | 45: | | 46: | | 47: | | 48: | | 49: | | 50: | | 51: | | 52: | | 53: | | 54: | | 55: | | 56: | | 57: | | 58: | | 59: | | 60: | | 61: | | 62: | | 63: | | 64: | | 65: | | 66: | | 67: | | 68: | | 69: | | 70: | | 71: | | 72: | | 73: | | 74: | | 75: | | 76: | | 77: | | 78: | | 79: | | 80: | | 81: | | 82: | | 83: | | 84: | | 85: | | 86: | | 87: | | 88: | | 89: | | 90: | | 91: | | 92: | | 93: | | 94: | | 95: | | 96: | | 97: | | 98: | | 99: | | 100: | | 101: | | 102: | | 103: | | 104: | | 105: | | 106: | | 107: | | 108: | | 109: | | 110: | | 111: | | 112: | | 113: | | 114: | | 115: | | 116: | | 117: | | 118: | | 119: | | 120: | | 121: | | | |
| /*
| 2xSaI GLSL shader
|
| - Copyright (C) 2007 guest(r) - guest.r@gmail.com
|
| - License: GNU-GPL
|
| - Enhanced by VerGreeneyes (10.5.2007)
|
| - (Experimental version - modified color "equality")
|
|
| The 2xSaI algorithm
|
| - Copyright (c) 1999-2001 by Derek Liauw Kie Fa.
| */
|
| const float th = .0625; // 2 * 1/2^5 tolerance (2 singletons in RGB555)
| const vec4 dt = vec4(65536.,256.,1.,0.);
| const vec4 dp = vec4(1.,1.,1.,0.);
| uniform vec4 OGL2Size, OGL2InvSize;
| uniform sampler2D OGL2Texture;
|
| #define reduce(color)(dot(color,dt))
|
| // Returns +1.0 when B equals both C and D while A equals neither,
| // -1.0 when A equals both C and D, and 0.0 in every other case.
| float GET_RESULT(float A,float B,float C,float D)
| {
|     bool onlyBMatches = (A != C) && (A != D) && (B == C) && (B == D);
|     bool aMatchesBoth = (A == C) && (A == D);
|     return float(onlyBMatches) - float(aMatchesBoth);
| }
|
| // Fragment entry point of the vec4 / tolerance variant. Texels are kept as
| // full vec4 values, so the alpha channel is carried through the p10 / p11
| // averages and written to gl_FragColor. The file-level weights dt and dp
| // both have a zero .w, so alpha takes no part in any comparison.
| void main()
| {
|
| // Calculating texel coordinates
| // fp is the fractional part of the texture coordinate scaled by OGL2Size
| // (assumes OGL2Size holds the texture dimensions - TODO confirm).
| // g1 / g2 are the diagonal and anti-diagonal offsets built from OGL2InvSize.
|
| vec2 OGL2Pos = gl_TexCoord[0].xy*OGL2Size.xy,
| fp = fract(OGL2Pos),
| g1 = vec2( OGL2InvSize.x,OGL2InvSize.y),
| g2 = vec2(-OGL2InvSize.x,OGL2InvSize.y);
|
| // Negating g2 swaps the x and y roles of every offset derived from it,
| // so the single p10 computation serves both "mixed" quarters.
| if(fp.x >= .5 && fp.y < .5) g2=-g2;
|
| // pC4 addresses the current texel, pC8 its neighbour along g1.
| // g3 = half of (g1-g2): an x-only step normally, a y-only step when g2
| // has been negated.
| vec2 pC4 = floor(OGL2Pos)/OGL2Size.xy,
| g3 = .5*(g1-g2), pC8 = pC4+g1;
|
|
| // Reading the texels
| // C0..C8 form the 3x3 block centred on C4; D0/D1/D2 and D4/D5 are fetched
| // two steps out from C4 (derived from the offsets above).
|
| vec4 C0 = texture2D(OGL2Texture,pC4-g1 ),
| C1 = texture2D(OGL2Texture,pC4-g2-g3 ),
| C2 = texture2D(OGL2Texture,pC4-g2 ),
| C3 = texture2D(OGL2Texture,pC4-g3 ),
| C4 = texture2D(OGL2Texture,pC4 ),
| C5 = texture2D(OGL2Texture,pC4+g3 ),
| C6 = texture2D(OGL2Texture,pC4+g2 ),
| C7 = texture2D(OGL2Texture,pC8-g3 ),
| C8 = texture2D(OGL2Texture,pC8 ),
| D0 = texture2D(OGL2Texture,pC4+2.*g2+g3),
| D1 = texture2D(OGL2Texture,pC8+g2 ),
| D2 = texture2D(OGL2Texture,pC8+g1-g3 ),
| D4 = texture2D(OGL2Texture,pC8-g2 ),
| D5 = texture2D(OGL2Texture,pC8+g3 ),
| p10,p11;
|
| // Scalar keys for the colours, used by the exact-equality tests below.
| float c0 = reduce(C0),c1 = reduce(C1),c2 = reduce(C2),c3 = reduce(C3),
| c4 = reduce(C4),c5 = reduce(C5),c6 = reduce(C6),c7 = reduce(C7),
| c8 = reduce(C8),d0 = reduce(D0),d1 = reduce(D1),d2 = reduce(D2),
| d4 = reduce(D4),d5 = reduce(D5);
|
| /* SaI code */
| /* Copied from the Dosbox source code */
| /* Copyright (C) 2002-2007 The DOSBox Team */
| /* License: GNU-GPL */
| /* Adapted by guest(r) on 20.4 and 9.5. 2007 */
| /* and VerGreeneyes (10.5.2007) */
|
|
| // Summed RGB absolute differences along the two diagonals, compared
| // against the file-level threshold th.
| float dif1 = dot(abs(C4-C8),dp), dif2 = dot(abs(C5-C7),dp);
|
| // Case 1: C4 and C8 are within tolerance.
| if(dif1 <= th)
| {
| if(dif2 > th)
| {
| // Only the C4-C8 diagonal is similar.
| p10 = (c4 == c3 && c7 == d2 || c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) ? C4 : .5*(C4+C7);
| p11 = C4;
| }
| else
| {
| // Both diagonals are similar.
| if(c4 == c5) p11 = (p10 = C4);
| else
| {
| // Vote between C4 and C5 using four GET_RESULT terms over the
| // surrounding texel pairs; r == 0 falls back to the 4-texel average.
| float r = GET_RESULT(c4,c5,c3,c1)+GET_RESULT(c4,c5,d5,d2)
| -GET_RESULT(c5,c4,d4,c2)-GET_RESULT(c5,c4,c6,d1);
| p10 = .5*(C4+C7);
| if(r > 0.) p11 = C4;
| else if(r < 0.) p11 = C5;
| else p11 = .25*(C4+C5+C7+C8);
| }
| }
| }
| // Case 2: only C5 and C7 are within tolerance.
| else if(dif2 <= th)
| {
| p10 = (c7 == c6 && c4 == c2 || c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) ? C7 : 0.5*(C4+C7);
| p11 = C5;
| }
| // Case 3: neither diagonal is similar: p11 is the 4-texel average.
| else
| {
| p11 = 0.25*(C4+C5+C7+C8);
| if(c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) p10 = C4;
| else if(c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) p10 = C7;
| else p10 = 0.5*(C4+C7);
| }
|
| // Distributing the final products
| // Both fractions >= .5 -> p11, exactly one >= .5 -> p10,
| // neither -> the untouched texel C4. All four components are written.
|
| if(fp.x >= .5 && fp.y >= .5) gl_FragColor = p11;
| else if(fp.x >= .5 || fp.y >= .5) gl_FragColor = p10;
| else gl_FragColor = C4;
| }
| | |
[Dieser Beitrag wurde am 10.05.2007 - 20:36 von VerGreeneyes aktualisiert]
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 10.05.2007 - 22:44 |  |
Hmm, I see you removed your post. Was the experimental build not working right? Also I was wondering, how big are the minute variations in colour we have to take into account? If it's just rounding errors, that can be solved by rounding the input values though it also kinda kills the speed.
Edit: just an update, here's a version I made that rounds the values. I found a few ways to speed things up a little since the last version so the speed hit isn't so big.. but I don't know if it changes anything, because I have no scenes to check it in! So let me know if this helps make the filter less glitchy, or if a more drastic approach is needed (like yours).
Fragment file: | Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | 13: | | 14: | | 15: | | 16: | | 17: | | 18: | | 19: | | 20: | | 21: | | 22: | | 23: | | 24: | | 25: | | 26: | | 27: | | 28: | | 29: | | 30: | | 31: | | 32: | | 33: | | 34: | | 35: | | 36: | | 37: | | 38: | | 39: | | 40: | | 41: | | 42: | | 43: | | 44: | | 45: | | 46: | | 47: | | 48: | | 49: | | 50: | | 51: | | 52: | | 53: | | 54: | | 55: | | 56: | | 57: | | 58: | | 59: | | 60: | | 61: | | 62: | | 63: | | 64: | | 65: | | 66: | | 67: | | 68: | | 69: | | 70: | | 71: | | 72: | | 73: | | 74: | | 75: | | 76: | | 77: | | 78: | | 79: | | 80: | | 81: | | 82: | | 83: | | 84: | | 85: | | 86: | | 87: | | 88: | | 89: | | 90: | | 91: | | 92: | | 93: | | 94: | | 95: | | 96: | | 97: | | 98: | | 99: | | 100: | | 101: | | 102: | | 103: | | 104: | | 105: | | 106: | | 107: | | 108: | | 109: | | 110: | | 111: | | 112: | | 113: | | | | /*
| 2xSaI GLSL shader
|
| - Copyright (C) 2007 guest(r) - guest.r@gmail.com
|
| - License: GNU-GPL
|
| - Enhanced by Ver Greeneyes (10.5.2007)
|
| The 2xSaI algorithm
|
| - Copyright (c) 1999-2001 by Derek Liauw Kie Fa.
| */
|
| const vec4 dt = vec4(65536.,256.,1.,.00390625);
| uniform vec4 OGL2Size, OGL2InvSize;
| uniform sampler2D OGL2Texture;
|
| #define reduce(color)(dot(color,dt))
|
| // Returns +1.0 when B equals both C and D while A equals neither,
| // -1.0 when A equals both C and D, and 0.0 in every other case.
| float GET_RESULT(float A,float B,float C,float D)
| {
|     bool onlyBMatches = (A != C) && (A != D) && (B == C) && (B == D);
|     bool aMatchesBoth = (A == C) && (A == D);
|     return float(onlyBMatches) - float(aMatchesBoth);
| }
|
| // Fragment entry point of the "rounded" variant. Every sampled texel is
| // snapped to integer channel levels before it is reduced to a scalar key,
| // so that tiny sampling differences cannot break the exact-equality tests
| // of the 2xSaI decision tree. The chosen product (C4, p10 or p11) is scaled
| // back to [0,1] when written to gl_FragColor.
| //
| // The scale factor is 255, not 256: a normalized channel value of 1.0 must
| // map to level 255. With 256 a full-intensity channel became 256, which
| // with dt = (65536, 256, 1, 1/256) aliases with the next channel's level 1
| // (e.g. pure green and the darkest red produced the same key), and colours
| // passed straight through (C4) came back slightly altered.
| void main()
| {
|     // Highest integer level of a channel (normalized value 1.0).
|     const float LEVELS = 255.;
|
|     // Calculating texel coordinates
|     // fp is the fractional part of the texture coordinate scaled by OGL2Size
|     // (assumes OGL2Size holds the texture dimensions - TODO confirm).
|     // g1 / g2 are the diagonal and anti-diagonal offsets built from OGL2InvSize.
|
|     vec2 OGL2Pos = gl_TexCoord[0].xy*OGL2Size.xy,
|          fp = fract(OGL2Pos),
|          g1 = vec2( OGL2InvSize.x,OGL2InvSize.y),
|          g2 = vec2(-OGL2InvSize.x,OGL2InvSize.y);
|
|     // Negating g2 swaps the x and y roles of every offset derived from it,
|     // so the single p10 computation serves both "mixed" quarters.
|     if(fp.x >= .5 && fp.y < .5) g2=-g2;
|
|     // pC4 addresses the current texel, pC8 its neighbour along g1.
|     // g3 = half of (g1-g2) and g4 = g2+g3 are the two axis-aligned steps.
|     vec2 pC4 = floor(OGL2Pos)/OGL2Size.xy,
|          g3 = .5*(g1-g2), pC8 = pC4+g1, g4 = g2+g3;
|
|
|     // Reading the texels, rounded to the nearest integer level
|
|     vec4 C0 = floor(LEVELS*texture2D(OGL2Texture,pC4-g1   )+.5),
|          C1 = floor(LEVELS*texture2D(OGL2Texture,pC4-g4   )+.5),
|          C2 = floor(LEVELS*texture2D(OGL2Texture,pC4-g2   )+.5),
|          C3 = floor(LEVELS*texture2D(OGL2Texture,pC4-g3   )+.5),
|          C4 = floor(LEVELS*texture2D(OGL2Texture,pC4      )+.5),
|          C5 = floor(LEVELS*texture2D(OGL2Texture,pC4+g3   )+.5),
|          C6 = floor(LEVELS*texture2D(OGL2Texture,pC4+g2   )+.5),
|          C7 = floor(LEVELS*texture2D(OGL2Texture,pC8-g3   )+.5),
|          C8 = floor(LEVELS*texture2D(OGL2Texture,pC8      )+.5),
|          D0 = floor(LEVELS*texture2D(OGL2Texture,pC4+g2+g4)+.5),
|          D1 = floor(LEVELS*texture2D(OGL2Texture,pC8+g2   )+.5),
|          D2 = floor(LEVELS*texture2D(OGL2Texture,pC8+g1-g3)+.5),
|          D4 = floor(LEVELS*texture2D(OGL2Texture,pC8-g2   )+.5),
|          D5 = floor(LEVELS*texture2D(OGL2Texture,pC8+g3   )+.5),
|          p10,p11;
|
|     // Scalar keys for the colours; all comparisons below are exact.
|     float c0 = reduce(C0),c1 = reduce(C1),c2 = reduce(C2),c3 = reduce(C3),
|           c4 = reduce(C4),c5 = reduce(C5),c6 = reduce(C6),c7 = reduce(C7),
|           c8 = reduce(C8),d0 = reduce(D0),d1 = reduce(D1),d2 = reduce(D2),
|           d4 = reduce(D4),d5 = reduce(D5);
|
|
|     /* SaI code */
|     /* Copied from the Dosbox source code */
|     /* Copyright (C) 2002-2007 The DOSBox Team */
|     /* License: GNU-GPL */
|     /* Adapted by guest(r) on 20.4 and 9.5. 2007 */
|     /* and Ver Greeneyes (10.5.2007) */
|
|     // Case 1: the C4-C8 diagonal matches.
|     if(c4 == c8)
|     {
|         if(c5 != c7)
|         {
|             // Only the C4-C8 diagonal matches.
|             p10 = (c4 == c3 && c7 == d2 || c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) ? C4 : .5*(C4+C7);
|             p11 = C4;
|         }
|         else
|         {
|             // Both diagonals match.
|             if(c4 == c5) p11 = (p10 = C4);
|             else
|             {
|                 // Vote between C4 and C5 over the surrounding texel pairs;
|                 // r == 0 falls back to the 4-texel average.
|                 float r = GET_RESULT(c4,c5,c3,c1)+GET_RESULT(c4,c5,d5,d2)
|                          -GET_RESULT(c5,c4,d4,c2)-GET_RESULT(c5,c4,c6,d1);
|                 p10 = .5*(C4+C7);
|                 if(r > 0.) p11 = C4;
|                 else if(r < 0.) p11 = C5;
|                 else p11 = .25*(C4+C5+C7+C8);
|             }
|         }
|     }
|     // Case 2: only the C5-C7 diagonal matches.
|     else if(c5 == c7)
|     {
|         p10 = (c7 == c6 && c4 == c2 || c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) ? C7 : 0.5*(C4+C7);
|         p11 = C5;
|     }
|     // Case 3: neither diagonal matches: p11 is the 4-texel average.
|     else
|     {
|         p11 = 0.25*(C4+C5+C7+C8);
|         if(c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) p10 = C4;
|         else if(c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) p10 = C7;
|         else p10 = 0.5*(C4+C7);
|     }
|
|     // Distributing the final products (scaled back from levels to [0,1])
|     // Both fractions >= .5 -> p11, exactly one >= .5 -> p10,
|     // neither -> the untouched texel C4.
|
|     if(fp.x >= .5 && fp.y >= .5) gl_FragColor = p11/LEVELS;
|     else if(fp.x >= .5 || fp.y >= .5) gl_FragColor = p10/LEVELS;
|     else gl_FragColor = C4/LEVELS;
| }
| | |
PS: If you want to test the speed, taking out the rounding changes should be simple enough. (just don't forget to take out the divisions at the bottom!)
[Dieser Beitrag wurde am 11.05.2007 - 01:15 von VerGreeneyes aktualisiert]
|