// Approach: a DP sped up with the quadrangle-inequality optimization is enough.
#include <bits/stdc++.h>
using namespace std;
// 64-bit integer alias; not used anywhere in the visible code.
typedef long long LL;
// Capacity of the first dimension of f and s (indexed by the group count, 0..m).
const int maxm = 808;
// Capacity of the second dimension of f and s, and of both dimensions of a (indices 0..n).
const int maxn = 4004;
// "Infinity" sentinel. Every byte is 0x3f, so memset(f, INF, ...) in work() leaves
// each int of f equal to INF.
const int INF = 0x3f3f3f3f;
// f[i][j]: DP value computed in work() for i groups covering positions 1..j.
int f[maxm][maxn];
// s[i][j]: the split index k that produced f[i][j]; used to bound later searches.
int s[maxm][maxn];
// a[i][j]: input matrix, overwritten in work() with its 2-D prefix sums.
// Row 0 and column 0 stay zero and act as the empty prefix.
int a[maxn][maxn];
// Matrix side length (n) and group count (m), both read from stdin in work().
int n, m;
// Reads one decimal integer from stdin into x.
//
// Skips leading whitespace (space, tab, CR, LF), accepts an optional leading '-',
// then consumes digits until the first non-digit character or end of input.
// The character that terminates the number is consumed as well.
// If no digits are available (for example at end of input), x is set to 0.
//
// @param x  receives the parsed value.
void read(int &x)
{
    x = 0;
    // Keep the result of getchar() in an int so EOF stays distinguishable from
    // every valid character; storing it in a char made the loops spin forever
    // once the input was exhausted.
    int ch = getchar();
    while (ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t')
        ch = getchar();

    bool negative = false;
    if (ch == '-') {
        negative = true;
        ch = getchar();
    }

    // Stop at the first non-digit (including EOF) so that '\r' from CRLF line
    // endings or other separators are never folded into the number.
    while (ch >= '0' && ch <= '9') {
        x = x * 10 + (ch - '0');
        ch = getchar();
    }

    if (negative)
        x = -x;
}
// Returns the inclusion-exclusion combination of four entries of a for the
// square with rows i..j and columns i..j. When a holds 2-D prefix sums (as
// built in work()), this is the sum of the original matrix over that square.
//
// @param i  first row/column of the square (1-based).
// @param j  last row/column of the square (1-based).
int calc(int i, int j)
{
    const int before = i - 1;              // last index excluded from the square
    const int whole  = a[j][j];            // prefix rectangle (1..j, 1..j)
    const int above  = a[before][j];       // rows above the square
    const int left   = a[j][before];       // columns left of the square
    const int corner = a[before][before];  // subtracted twice, so added back once
    return whole - above - left + corner;
}
// Reads n and m, then an n x n matrix, and prints f[m][n] / 2, where
//   f[g][e] = min over p < e of f[g-1][p] + calc(p + 1, e),  f[0][0] = 0.
//
// The search for the best split p is restricted to the window
// [s[g-1][e], s[g][e+1]]; this relies on the cost satisfying the quadrangle
// inequality (assumed, not checked here).
// NOTE(review): the final division by 2 presumably compensates for a symmetric
// input matrix counting every pair twice -- confirm against the problem statement.
void work()
{
    scanf("%d%d", &n, &m);

    // Read each row and turn it into a running row prefix sum as it arrives.
    for (int row = 1; row <= n; ++row) {
        for (int col = 1; col <= n; ++col) {
            read(a[row][col]);
            a[row][col] += a[row][col - 1];
        }
    }

    // Accumulate down the columns to finish the 2-D prefix sums.
    for (int row = 1; row <= n; ++row) {
        for (int col = 1; col <= n; ++col) {
            a[row][col] += a[row - 1][col];
        }
    }

    // Every byte of INF is 0x3f, so the byte-wise fill leaves each cell equal to INF.
    memset(f, INF, sizeof f);
    f[0][0] = 0;

    for (int groups = 1; groups <= m; ++groups) {
        // Walk the end position downwards so s[groups][end + 1] is already final.
        for (int end = n; end >= 1; --end) {
            const int upper = (end == n) ? n : s[groups][end + 1];
            for (int split = s[groups - 1][end]; split <= upper && split < end; ++split) {
                const int candidate = f[groups - 1][split] + calc(split + 1, end);
                if (candidate < f[groups][end]) {
                    f[groups][end] = candidate;
                    s[groups][end] = split;
                }
            }
        }
    }

    printf("%d\n", f[m][n] / 2);
}
// Program entry point: solves a single instance read from stdin.
int main()
{
    // Debug aid: uncomment to take the input from a local file named "data".
    // freopen("data", "r", stdin);
    work();
    return 0;
}