| Author | Topics » Book an abo for this thread |  |
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 14.05.2007 - 16:59 |  |
Hmm, well, I should be able to do some optimisation on that front; don't expect to get anything approaching full speed though, as this is always going to be at least four times slower than the 2xSaI filter by itself (and there's some additional overhead). I fixed the lines issue though, and made it a bit smaller too... maybe this version will work on your card (though I doubt there's much of a difference).
Fragment file: | Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | 13: | | 14: | | 15: | | 16: | | 17: | | 18: | | 19: | | 20: | | 21: | | 22: | | 23: | | 24: | | 25: | | 26: | | 27: | | 28: | | 29: | | 30: | | 31: | | 32: | | 33: | | 34: | | 35: | | 36: | | 37: | | 38: | | 39: | | 40: | | 41: | | 42: | | 43: | | 44: | | 45: | | 46: | | 47: | | 48: | | 49: | | 50: | | 51: | | 52: | | 53: | | 54: | | 55: | | 56: | | 57: | | 58: | | 59: | | 60: | | 61: | | 62: | | 63: | | 64: | | 65: | | 66: | | 67: | | 68: | | 69: | | 70: | | 71: | | 72: | | 73: | | 74: | | 75: | | 76: | | 77: | | 78: | | 79: | | 80: | | 81: | | 82: | | 83: | | 84: | | 85: | | 86: | | 87: | | 88: | | 89: | | 90: | | 91: | | 92: | | 93: | | 94: | | 95: | | 96: | | 97: | | 98: | | | | uniform vec4 OGL2Size, OGL2InvSize;
| uniform sampler2D OGL2Texture; // source image; sampled by xSaI()
| const float pi = 1.570796326794896619231321691640; // NOTE: despite the name, this value is pi/2; main() uses it as-is
| const vec4 vec1 = vec4(1.); // all-ones vector
| const vec4 dt = vec4(16777216.,65536.,256.,1.); // per-channel weights (256^3, 256^2, 256, 1) applied by reduce()
| // Forward declarations; the definitions follow main().
| float reduce(vec4);
| float GET_RESULT(float, float, float, float);
| vec4 xSaI(vec2, vec2);
|
| // Fragment entry point: evaluates xSaI() for four neighbouring texels and
| // blends the four results with distance-based weights.
| void main(){
|   // Position in texel units, its fractional part, one-texel steps along x / y,
|   // and the texture coordinate snapped to the containing texel (pC4).
|   vec2 OGL2Pos = gl_TexCoord[0].xy*OGL2Size.xy,
|        fp = fract(OGL2Pos),
|        dx = vec2(OGL2InvSize.x,0.),
|        dy = vec2(0.,OGL2InvSize.y),
|        pC4 = floor(OGL2Pos)/OGL2Size.xy,
|        // 0/1 flags per axis, taken before fp is folded below; they are passed
|        // to xSaI() as its first argument, one pair per blended sample.
|        s0 = vec2(float(fp.x< .5),float(fp.y< .5)),
|        s1 = vec2(float(fp.x< .5),float(fp.y>=.5)),
|        s2 = vec2(float(fp.x>=.5),float(fp.y< .5)),
|        s3 = vec2(float(fp.x>=.5),float(fp.y>=.5));
|   // Fold fp into [0,.5] on each axis; a fold reverses the matching step.
|   if(fp.x >= .5){fp.x = 1. - fp.x; dx = -dx;}
|   if(fp.y >= .5){fp.y = 1. - fp.y; dy = -dy;}
|   // Rescale and fold once more; this time a fold cancels the step entirely.
|   fp *= 2.;
|   if(fp.x >= .5){fp.x = 1. - fp.x; dx = vec2(0.);}
|   if(fp.y >= .5){fp.y = 1. - fp.y; dy = vec2(0.);}
|   // One xSaI() result per column, for the texels at pC4 minus 0 or 1 steps.
|   mat4 C = mat4(xSaI(s0,pC4-dx-dy),xSaI(s1,pC4-dx),
|                 xSaI(s2,pC4-dy),xSaI(s3,pC4));
|   // Column 0: (fp+.5)^2, column 1: (fp-.5)^2 (component-wise squares).
|   mat2 gp = mat2((fp+.5)*(fp+.5),(fp-.5)*(fp-.5));
|   // Squared distances from fp to (-.5,-.5), (-.5,.5), (.5,-.5) and (.5,.5).
|   // FIX: the closing parenthesis of this vec4 constructor was missing.
|   vec4 c = vec4(gp[0][0]+gp[0][1],gp[0][0]+gp[1][1],
|                 gp[1][0]+gp[0][1],gp[1][0]+gp[1][1]);
|   c = vec1 - sqrt(c);                  // 1 - distance
|   c *= vec4(greaterThan(c,vec4(0.)));  // negative weights become 0
|   c = -cos(pi*(c+vec1));               // the constant named pi is declared as 1.5707963... (pi/2)
|   // Weighted average of the four xSaI() results.
|   gl_FragColor = (C[0]*c.x+C[1]*c.y+C[2]*c.z+C[3]*c.w)/(c.x+c.y+c.z+c.w);
| }
|
| // Collapses an RGBA colour into a single scalar by weighting its channels
| // with dt, so that colours can be compared with one float comparison.
| float reduce(vec4 colour){
|   float key = dot(dt,colour);
|   return key;
| }
|
| // Returns +1 when A differs from both C and D while B equals both,
| // -1 when A equals both C and D, and 0 otherwise.
| float GET_RESULT(float A, float B, float C, float D){
|   bool onlyBMatches = (A != C) && (A != D) && (B == C) && (B == D);
|   bool aMatches = (A == C) && (A == D);
|   return float(onlyBMatches) - float(aMatches);
| }
|
| vec4 xSaI(vec2 fp, vec2 pC4){ // returns one interpolated colour for the texel at pC4; fp.x / fp.y are compared against .5 to choose which one
| vec4 rValue;
| vec2 g1 = vec2( OGL2InvSize.x,OGL2InvSize.y), // one-texel diagonal step (+x,+y)
| g2 = vec2(-OGL2InvSize.x,OGL2InvSize.y); // one-texel anti-diagonal step (-x,+y)
| if(fp.x >= .5 && fp.y < .5) g2=-g2; // flipping g2 exchanges the directions of g3 and g4 below (x and y swap roles)
| vec2 g3 = .5*(g1-g2), pC8 = pC4+g1, g4 = g2+g3; // g3, g4: the two axis-aligned one-texel steps; pC8: the g1-diagonal neighbour
| vec4 C0 = texture2D(OGL2Texture,pC4-g1 ), // 14 texture fetches around pC4 follow; C4 is the centre texel
| C1 = texture2D(OGL2Texture,pC4-g4 ),
| C2 = texture2D(OGL2Texture,pC4-g2 ),
| C3 = texture2D(OGL2Texture,pC4-g3 ),
| C4 = texture2D(OGL2Texture,pC4 ),
| C5 = texture2D(OGL2Texture,pC4+g3 ),
| C6 = texture2D(OGL2Texture,pC4+g2 ),
| C7 = texture2D(OGL2Texture,pC8-g3 ), // equals pC4+g4
| C8 = texture2D(OGL2Texture,pC8 ),
| D0 = texture2D(OGL2Texture,pC4+g2+g4), // D* samples lie two texels away from pC4 along g3 or g4
| D1 = texture2D(OGL2Texture,pC8+g2 ),
| D2 = texture2D(OGL2Texture,pC8+g1-g3),
| D4 = texture2D(OGL2Texture,pC8-g2 ),
| D5 = texture2D(OGL2Texture,pC8+g3 ),
| p10,p11; // candidate outputs: p10 is chosen when one flag is set, p11 when both are
| float c0 = reduce(C0),c1 = reduce(C1),c2 = reduce(C2),c3 = reduce(C3), // scalar keys so colours can be compared with == / !=
| c4 = reduce(C4),c5 = reduce(C5),c6 = reduce(C6),c7 = reduce(C7),
| c8 = reduce(C8),d0 = reduce(D0),d1 = reduce(D1),d2 = reduce(D2),
| d4 = reduce(D4),d5 = reduce(D5);
| if(c4 == c8){ // C4/C8 diagonal matches
| if(c5 != c7){ // ...and the C5/C7 diagonal does not
| p10 = (c4 == c3 && c7 == d2 || c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) ? C4 : .5*(C4+C7);
| p11 = C4;
| }else{ // both diagonals match
| if(c4 == c5) p11 = (p10 = C4); // all four cell colours are equal
| else{
| float r = GET_RESULT(c4,c5,c3,c1)+GET_RESULT(c4,c5,d5,d2) // vote from the surrounding texels: r > 0 selects C4, r < 0 selects C5
| -GET_RESULT(c5,c4,d4,c2)-GET_RESULT(c5,c4,c6,d1);
| p10 = .5*(C4+C7);
| if(r > 0.) p11 = C4;
| else if(r < 0.) p11 = C5;
| else p11 = .25*(C4+C5+C7+C8); // tie: average of the 2x2 cell
| }
| }
| }else if(c5 == c7){ // only the C5/C7 diagonal matches
| p10 = (c7 == c6 && c4 == c2 || c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) ? C7 : 0.5*(C4+C7);
| p11 = C5;
| }else{ // neither diagonal matches
| p11 = 0.25*(C4+C5+C7+C8); // average of the 2x2 cell
| if(c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) p10 = C4;
| else if(c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) p10 = C7;
| else p10 = 0.5*(C4+C7);
| }
| if(fp.x >= .5 && fp.y >= .5) rValue = p11; // both flags set
| else if(fp.x >= .5 || fp.y >= .5) rValue = p10; // exactly one flag set
| else rValue = C4; // no flag set: centre texel unchanged
| return rValue;
| | } | |
Edit: the current version does 14*4 = 56 texture lookups. I should be able to cut that down to 35 at least.
[Dieser Beitrag wurde am 14.05.2007 - 17:37 von VerGreeneyes aktualisiert]
|
|
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 14.05.2007 - 18:16 |  |
This code could hopefully work with a reduced set of lookups and with 4x2 branches of the xSaI func. (with 4 calls realized and with colors as input data).
The code
| | | |
| mat4 C = mat4(xSaI(s0,pC4-dx-dy),xSaI(s1,pC4-dx),
| xSaI(s2,pC4-dy),xSaI(s3,pC4));
| | |
points to a max. 1 texel extending to the left and up.
I think it can be done with 25 lookups, but I don't know what amount of additional spam code is actually generated by the ATI compiler.
[Dieser Beitrag wurde am 15.05.2007 - 17:28 von guest aktualisiert]
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 14.05.2007 - 21:04 |  |
Well, it turns out GLSL is mean when it comes to arrays.. it errors out on me and I don't know how to fix it. So there goes the option I wanted to try *sigh*
Edit: here's what I was trying: | Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | 13: | | 14: | | 15: | | 16: | | 17: | | 18: | | 19: | | 20: | | 21: | | 22: | | 23: | | 24: | | 25: | | 26: | | 27: | | 28: | | 29: | | 30: | | 31: | | 32: | | 33: | | 34: | | 35: | | 36: | | 37: | | 38: | | 39: | | 40: | | 41: | | 42: | | 43: | | 44: | | 45: | | 46: | | 47: | | 48: | | 49: | | 50: | | 51: | | 52: | | 53: | | 54: | | 55: | | 56: | | 57: | | 58: | | 59: | | 60: | | 61: | | 62: | | 63: | | 64: | | 65: | | 66: | | 67: | | 68: | | 69: | | 70: | | 71: | | 72: | | 73: | | 74: | | 75: | | 76: | | 77: | | 78: | | 79: | | 80: | | 81: | | 82: | | 83: | | 84: | | 85: | | 86: | | 87: | | 88: | | 89: | | 90: | | 91: | | 92: | | 93: | | 94: | | 95: | | 96: | | 97: | | 98: | | 99: | | 100: | | 101: | | 102: | | 103: | | 104: | | 105: | | 106: | | 107: | | 108: | | 109: | | 110: | | 111: | | 112: | | 113: | | 114: | | 115: | | 116: | | 117: | | 118: | | 119: | | 120: | | 121: | | 122: | | 123: | | 124: | | 125: | | 126: | | 127: | | | | uniform vec4 OGL2Size, OGL2InvSize;
| uniform sampler2D OGL2Texture; // source image
| const float pi = 1.570796326794896619231321691640; // NOTE: despite the name, this value is pi/2
| const vec2 pC4 = gl_TexCoord[0].xy; // NOTE(review): const initialised from a varying; likely rejected by GLSL compilers, confirm
| vec2 dx = vec2(OGL2InvSize.x,0.), // one-texel step along x
| dy = vec2(0.,OGL2InvSize.y); // one-texel step along y
| const vec4 vec1 = vec4(1.); // all-ones vector
| const vec4 dt = vec4(16777216.,65536.,256.,1.); // per-channel weights applied by reduce()
| vec4 T[35] = {texture2D(OGL2Texture,pC4-2.*dx-2.*dy), texture2D(OGL2Texture,pC4- dx-2.*dy), // T[0..5]: y offset -2, x offsets -2..+3. NOTE(review): GLSL 1.10 reportedly has no array initialisers, so this may not compile
| texture2D(OGL2Texture,pC4 -2.*dy), texture2D(OGL2Texture,pC4+ dx-2.*dy),
| texture2D(OGL2Texture,pC4+2.*dx-2.*dy), texture2D(OGL2Texture,pC4+3.*dx-2.*dy),
| texture2D(OGL2Texture,pC4-2.*dx- dy), texture2D(OGL2Texture,pC4- dx- dy), // T[6..11]: y offset -1
| texture2D(OGL2Texture,pC4 - dy), texture2D(OGL2Texture,pC4+ dx- dy),
| texture2D(OGL2Texture,pC4+2.*dx- dy), texture2D(OGL2Texture,pC4+3.*dx- dy),
| texture2D(OGL2Texture,pC4-2.*dx ), texture2D(OGL2Texture,pC4- dx ), // T[12..17]: y offset 0; T[14] is pC4 itself
| texture2D(OGL2Texture,pC4 ), texture2D(OGL2Texture,pC4+ dx ),
| texture2D(OGL2Texture,pC4+2.*dx ), texture2D(OGL2Texture,pC4+3.*dx ),
| texture2D(OGL2Texture,pC4-2.*dx+ dy), texture2D(OGL2Texture,pC4- dx+ dy), // T[18..23]: y offset +1
| texture2D(OGL2Texture,pC4 + dy), texture2D(OGL2Texture,pC4+ dx+ dy),
| texture2D(OGL2Texture,pC4+2.*dx+ dy), texture2D(OGL2Texture,pC4+3.*dx+ dy),
| texture2D(OGL2Texture,pC4-2.*dx+2.*dy), texture2D(OGL2Texture,pC4- dx+2.*dy), // T[24..29]: y offset +2
| texture2D(OGL2Texture,pC4 +2.*dy), texture2D(OGL2Texture,pC4+ dx+2.*dy),
| texture2D(OGL2Texture,pC4+2.*dx+2.*dy), texture2D(OGL2Texture,pC4+3.*dx+2.*dy),
| texture2D(OGL2Texture,pC4-2.*dx+3.*dy), texture2D(OGL2Texture,pC4- dx+3.*dy), // T[30..34]: y offset +3, only x offsets -2..+2
| texture2D(OGL2Texture,pC4 +3.*dy), texture2D(OGL2Texture,pC4+ dx+3.*dy),
| texture2D(OGL2Texture,pC4+2.*dx+3.*dy)};
| // Forward declarations; the definitions follow main().
| float reduce(vec4);
| float GET_RESULT(float, float, float, float);
| vec4 xSaI(vec2, int);
|
| // Fragment entry point of the table-based variant: evaluates xSaI() for four
| // neighbouring texels, addressed as integer offsets into T, and blends the
| // four results with distance-based weights.
| void main(){
|   // Sub-texel position plus 0/1 flags per axis (taken before fp is folded
|   // below); each flag pair is passed to xSaI() as its first argument.
|   vec2 fp = fract(pC4*OGL2Size.xy),
|        s0 = vec2(float(fp.x< .5),float(fp.y< .5)),
|        s1 = vec2(float(fp.x< .5),float(fp.y>=.5)),
|        s2 = vec2(float(fp.x>=.5),float(fp.y< .5)),
|        s3 = vec2(float(fp.x>=.5),float(fp.y>=.5));
|   // Fold fp into [0,.5] on each axis; a fold reverses the matching step.
|   if(fp.x >= .5){fp.x = 1. - fp.x; dx = -dx;}
|   if(fp.y >= .5){fp.y = 1. - fp.y; dy = -dy;}
|   // Rescale and fold once more; this time a fold cancels the step entirely.
|   fp *= 2.;
|   if(fp.x >= .5){fp.x = 1. - fp.x; dx = vec2(0.);}
|   if(fp.y >= .5){fp.y = 1. - fp.y; dy = vec2(0.);}
|   // Table offsets: one entry per x step, six entries (one table row) per y step.
|   mat4 C = mat4(xSaI(s0,int(sign(-dx.x)+6.*sign(-dy.y))),
|                 xSaI(s1,int(sign(-dx.x))),
|                 xSaI(s2,int(6.*sign(-dy.y))),
|                 xSaI(s3,0));
|   // Column 0: (fp+.5)^2, column 1: (fp-.5)^2 (component-wise squares).
|   mat2 gp = mat2((fp+.5)*(fp+.5),(fp-.5)*(fp-.5));
|   // Squared distances from fp to (-.5,-.5), (-.5,.5), (.5,-.5) and (.5,.5).
|   // FIX: the closing parenthesis of this vec4 constructor was missing.
|   vec4 c = vec4(gp[0][0]+gp[0][1],gp[0][0]+gp[1][1],
|                 gp[1][0]+gp[0][1],gp[1][0]+gp[1][1]);
|   c = vec1 - sqrt(c);                  // 1 - distance
|   c *= vec4(greaterThan(c,vec4(0.)));  // negative weights become 0
|   c = -cos(pi*(c+vec1));               // the constant named pi is declared as 1.5707963... (pi/2)
|   // Weighted average of the four xSaI() results.
|   gl_FragColor = (C[0]*c.x+C[1]*c.y+C[2]*c.z+C[3]*c.w)/(c.x+c.y+c.z+c.w);
| }
|
| // Collapses an RGBA colour into a single scalar by weighting its channels
| // with dt, so that colours can be compared with one float comparison.
| float reduce(vec4 colour){
|   float key = dot(dt,colour);
|   return key;
| }
|
| // Returns +1 when A differs from both C and D while B equals both,
| // -1 when A equals both C and D, and 0 otherwise.
| float GET_RESULT(float A, float B, float C, float D){
|   bool onlyBMatches = (A != C) && (A != D) && (B == C) && (B == D);
|   bool aMatches = (A == C) && (A == D);
|   return float(onlyBMatches) - float(aMatches);
| }
|
| // Table-based xSaI: returns one interpolated colour for the texel T[14+v].
| //   fp - 0/1 flags per axis, compared against .5 to choose the output
| //        (none set: centre texel, one set: p10, both set: p11).
| //   v  - integer offset into T (one per x step, six per y step).
| // T is laid out in rows of six samples (x offsets -2..+3) starting at
| // y offset -2, so the sample at texel offset (x,y) from the centre is
| // T[14 + x + 6*y + v].
| vec4 xSaI(vec2 fp, int v){
|   vec4 rValue;
|   vec4 C0,C1,C2,C3,C4,C5,C6,C7,C8,D0,D1,D2,D4,D5;
|   // Samples whose offset is the same in both orientations.
|   C0 = T[ 7+v];   // (-1,-1)
|   C4 = T[14+v];   // ( 0, 0) centre
|   C8 = T[21+v];   // (+1,+1)
|   if(fp.x >= .5 && fp.y < .5){
|     // Transposed orientation: x and y offsets swap roles.
|     C1 = T[13+v];   // (-1, 0)
|     C2 = T[19+v];   // (-1,+1)
|     C3 = T[ 8+v];   // ( 0,-1)
|     C5 = T[20+v];   // ( 0,+1)
|     C6 = T[ 9+v];   // (+1,-1)
|     C7 = T[15+v];   // (+1, 0)
|     D0 = T[10+v];   // (+2,-1)
|     D1 = T[16+v];   // (+2, 0)
|     D2 = T[22+v];   // (+2,+1)
|     D4 = T[26+v];   // ( 0,+2)
|     // FIX: was T[22+v] (a duplicate of D2); the transposed D5 lies at (+1,+2).
|     D5 = T[27+v];
|   }else{
|     C1 = T[ 8+v];   // ( 0,-1)
|     C2 = T[ 9+v];   // (+1,-1)
|     C3 = T[13+v];   // (-1, 0)
|     C5 = T[15+v];   // (+1, 0)
|     C6 = T[19+v];   // (-1,+1)
|     C7 = T[20+v];   // ( 0,+1)
|     // FIX: was T[22+v] (the D5 sample); D0 lies at (-1,+2).
|     D0 = T[25+v];
|     D1 = T[26+v];   // ( 0,+2)
|     D2 = T[27+v];   // (+1,+2)
|     D4 = T[16+v];   // (+2, 0)
|     D5 = T[22+v];   // (+2,+1)
|   }
|   vec4 p10,p11;
|   // Scalar keys so colours can be compared with == / !=.
|   float c0 = reduce(C0),c1 = reduce(C1),c2 = reduce(C2),c3 = reduce(C3),
|         c4 = reduce(C4),c5 = reduce(C5),c6 = reduce(C6),c7 = reduce(C7),
|         c8 = reduce(C8),d0 = reduce(D0),d1 = reduce(D1),d2 = reduce(D2),
|         d4 = reduce(D4),d5 = reduce(D5);
|   if(c4 == c8){
|     // C4/C8 diagonal matches.
|     if(c5 != c7){
|       p10 = (c4 == c3 && c7 == d2 || c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) ? C4 : .5*(C4+C7);
|       p11 = C4;
|     }else{
|       // Both diagonals match.
|       if(c4 == c5) p11 = (p10 = C4);
|       else{
|         // Vote from the surrounding texels: r > 0 selects C4, r < 0 selects C5.
|         float r = GET_RESULT(c4,c5,c3,c1)+GET_RESULT(c4,c5,d5,d2)
|                  -GET_RESULT(c5,c4,d4,c2)-GET_RESULT(c5,c4,c6,d1);
|         p10 = .5*(C4+C7);
|         if(r > 0.) p11 = C4;
|         else if(r < 0.) p11 = C5;
|         else p11 = .25*(C4+C5+C7+C8);
|       }
|     }
|   }else if(c5 == c7){
|     // Only the C5/C7 diagonal matches.
|     p10 = (c7 == c6 && c4 == c2 || c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) ? C7 : 0.5*(C4+C7);
|     p11 = C5;
|   }else{
|     // Neither diagonal matches: p11 is the average of the 2x2 cell.
|     p11 = 0.25*(C4+C5+C7+C8);
|     if(c4 == c5 && c4 == c6 && c3 != c7 && c7 == d0) p10 = C4;
|     else if(c7 == c3 && c7 == c8 && c4 != c6 && c4 == c0) p10 = C7;
|     else p10 = 0.5*(C4+C7);
|   }
|   if(fp.x >= .5 && fp.y >= .5) rValue = p11;
|   else if(fp.x >= .5 || fp.y >= .5) rValue = p10;
|   else rValue = C4;
|   return rValue;
| | } | |
[Dieser Beitrag wurde am 14.05.2007 - 21:19 von VerGreeneyes aktualisiert]
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 15.05.2007 - 09:51 |  |
The GLSpec book v.1.10 states (page 21) that there is no mechanism for initializing arrays at declaration time from within a shader.
If the upper code works for you that means nVidia guys take "black, but slim possibility of gray" as "we'll make it white". Shame on them. 
Indexing an array with an integer variable (still within boundaries) crashes the app. 
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 15.05.2007 - 13:22 |  |
Yeah, I know the code doesn't work, although I was curious whether or not it would work with the ATI compiler. To be honest, I'm not sure how to go about reducing the number of texture lookups now (at least without -completely- killing performance).
Edit: hang on, you can't even -initialise- it? Jeez, they really don't like arrays huh..
[Dieser Beitrag wurde am 15.05.2007 - 13:24 von VerGreeneyes aktualisiert]
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 15.05.2007 - 15:46 |  |
I guess there are two more options left...
1. Wait for the multipass feature (a nice adaptable implementation would be cool )
2. Try to think it out. (much work)
Or you can try something new. 
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 15.05.2007 - 17:41 |  |
A multipass solution would be great, something like a list of filters with the amount of upscaling they're allowed to do.. *drool* I'm also hoping the SNES emulator bsnes will implement OGL2 soonish; the author (byuu) has said he really likes the idea but doesn't really know where to start.
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 15.05.2007 - 20:15 |  |
As i recall VBA has OGL support.
Peeking into its source could reveal a mystery or two.
Nevertheless at least a 2 pass support would be fine - with a simple linear last shader to fit the image.
|
VerGreeneyes Strong supporter
 

Status:Offline Date registered: 26.04.2007 Post:89 Send Message | Created on 15.05.2007 - 20:42 |  |
Yeah, it's really just single pass that's a pain. By the way, I've been thinking a bit about the problem of reducing the amount of texture lookups used, and one possibility I see must distinguish between nine distinct scenarios:
| Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | | | dx.x < 0.0 && dy.y < 0.0,
| dx.x < 0.0 && dy.y == 0.0,
| dx.x < 0.0 && dy.y > 0.0,
| dx.x == 0.0 && dy.y < 0.0,
| dx.x == 0.0 && dy.y == 0.0,
| dx.x == 0.0 && dy.y > 0.0,
| dx.x > 0.0 && dy.y < 0.0,
| dx.x > 0.0 && dy.y == 0.0 and
| | dx.x > 0.0 && dy.y > 0.0. | | Implementing this would obviously result in a big speed loss, but maybe it's worth it.. doing so might point the way to better solutions, as well.
[Dieser Beitrag wurde am 15.05.2007 - 21:00 von VerGreeneyes aktualisiert]
|
guest  Real addict
  

Status:Offline Date registered: 30.07.2004 Post:856 Send Message | Created on 15.05.2007 - 21:15 |  |
I would try the color-reduced Scale2x variant - requires 4x5 = 20 total lookups.
It would be a nice scale and sharp enough for the more blurry "average weight filter".
Link to the shader.
PS:
The color reduction can, OTOH, be made much nicer in a manner that doesn't spoil the palette too much:
| Code: | | 1: | | 2: | | 3: | | 4: | | 5: | | 6: | | 7: | | 8: | | 9: | | 10: | | 11: | | 12: | | | |
| uniform sampler2D OGL2Texture;
|
| // Colour-reduction pass: quantises each RGB channel to eighths in
| // square-root space and squares the result again.
| void main()
| {
|   vec4 texel = texture2D(OGL2Texture, gl_TexCoord[0].xy);
|   vec3 c11 = texel.xyz;
|
|   // sqrt -> quantise to multiples of 1/8 -> square again.
|   c11 = floor(8.0*sqrt(c11))*0.125; c11*=c11;
|
|   // FIX: the original wrote only gl_FragColor.xyz, leaving alpha undefined;
|   // the source texel's alpha is now passed through unchanged.
|   gl_FragColor = vec4(c11, texel.w);
| }
| | |
[Dieser Beitrag wurde am 15.05.2007 - 22:50 von guest aktualisiert]
|