首先解决循环(环面)问题:把四个相同的 n×n 矩阵拼成一个 2n×2n 的大矩阵即可。

然后在新的大矩阵上求长、宽均不超过 n 的最大子矩阵和。最大子矩阵和可以转化成最大子段和问题(Uva108):枚举子矩阵的上下边界并把各列压缩成一个数,问题就变成在长度为 m(m=2n)的数列上求一个长度至多为 n 的最大子段和。预处理前缀和后可以 O(n^2) 求解,具体形式为 sum[i]-sum[j](0<=j<i 且 i-j<=n);用数据结构维护 sum[j] 的最小值是 O(nlogn),而用单调队列维护 sum[j] 是 O(n) 的。枚举上下边界共 O(n^2) 种,所以总复杂度为 O(n^3)。

#include <bits/stdc++.h>
using namespace std;
// Capacity (per dimension) of every static working array below.
const int maxn = 500;
// Large positive sentinel; main() starts each case with ans = -INF.
const int INF = 0x3f3f3f3f;

int t;                 // number of test cases, read once by main()
int n;                 // side length of the input matrix; also the cap on sub-rectangle height/width
int m;                 // side length of the tiled matrix (main() sets it to 2 * n)
int temp[maxn][maxn];  // input matrix as read, 1-based
int a[maxn][maxn];     // input matrix tiled 2 x 2, 1-based
int sum[maxn][maxn];   // sum[i][j]: sum of a[1..i][j]; row 0 is never written and stays zero
int b[maxn];           // per-column sums of the row band currently examined
int dp[maxn];          // prefix sums of b[], filled by solve()
int ans;               // best sub-rectangle sum found so far for the current case
// One slot of the array-backed monotonic queue that solve() maintains.
struct Queue {
    int p;  // index stored with the entry; solve() uses it in its window-length test
    int v;  // prefix-sum value the entry stands for
};

// Backing storage for the queue; solve() addresses it with 1-based head/tail indices.
Queue que[maxn];
// Raises the global `ans` to the best sum of a contiguous run of at least one
// and at most `n` elements taken from b[1..m].
//
// dp[] is filled with the prefix sums of b[] (dp[0] = 0), so the run
// b[j+1..i] sums to dp[i] - dp[j]. For every right end i, the smallest
// eligible dp[j] sits at the front of a monotonic queue (`que`, values
// strictly increasing from front to back).
//
// Each queue entry stores in `p` the FIRST index of the run it stands for
// (that is j + 1), so the run length at right end i is i - p + 1.
// Fix: the pushed entries used to record i instead of i + 1, which made the
// eviction test one step too strict. With n == 1 that emptied the queue and
// the following read of que[l].v used a stale slot, producing wrong sums.
void solve() {
    dp[0] = 0;
    for (int i = 1; i <= m; i++) dp[i] = dp[i-1] + b[i];

    // Seed with the empty prefix: value dp[0] = 0, run starting at index 1.
    int l = 1, r = 1;
    que[1].p = 1;
    que[1].v = 0;

    for (int i = 1; i <= m; i++) {
        // Drop prefixes whose run ending at i would be longer than n. The
        // entry added on the previous iteration describes a run of length 1,
        // so for n >= 1 the queue is never emptied here.
        while (l <= r && i - que[l].p + 1 > n) l++;

        ans = std::max(ans, dp[i] - que[l].v);

        // Keep values increasing: an older prefix that is not smaller than
        // dp[i] can never beat it for any later right end.
        while (l <= r && dp[i] <= que[r].v) r--;
        ++r;
        que[r].p = i + 1;  // a run that subtracts dp[i] starts at element i + 1
        que[r].v = dp[i];
    }
}
int main() {
scanf("%d", &t);
while (t--) {
scanf("%d", &n);
for (int i = 1; i <= n; i++) {
for (int j = 1; j <= n; j++) scanf("%d", &temp[i][j]);
}
for (int i = 1; i <= 2; i++) {
for (int j = 1; j <= 2; j++) {
for (int x = 1; x <= n; x++) {
for (int y = 1; y <= n; y++) {
a[(i-1)*n+x][(j-1)*n+y] = temp[x][y];
}
}
}
}
m = n*2;
for (int i = 1; i <= m; i++) {
for (int j = 1; j <= m; j++) {
sum[i][j] = sum[i-1][j]+a[i][j];
}
}
ans = -INF;
for (int k = 1; k <= n; k++) {
for (int s = 1; s+k-1 <= m; s++) {
int t = s+k-1;
for (int i = 1; i <= m; i++) {
b[i] = sum[t][i] - sum[s-1][i];
}
solve();
}
}
printf("%d\n", ans);
}
return 0;
}